Merge branch 'master' of github.com:ClickHouse/ClickHouse into insert-cluster

2024-11-30 03:22:14 +00:00 · 2020-12-24 07:05:07 +00:00 · 2020-12-24 07:05:07 +00:00 · 714a4f8f1c
commit 714a4f8f1c
parent a6429cc416 a58acbae67
329 changed files with 8110 additions and 2037 deletions
--- a/.github/workflows/anchore-analysis.yml
+++ b/.github/workflows/anchore-analysis.yml
@ -1,8 +1,8 @@
 # This workflow checks out code, performs an Anchore container image
 # vulnerability and compliance scan, and integrates the results with
-# GitHub Advanced Security code scanning feature.  For more information on
+# GitHub Advanced Security code scanning feature. For more information on
 # the Anchore scan action usage and parameters, see
-# https://github.com/anchore/scan-action.  For more information on
+# https://github.com/anchore/scan-action. For more information on
 # Anchore container image scanning in general, see
 # https://docs.anchore.com.

@ -28,18 +28,12 @@ jobs:
        perl -pi -e 's|=\$version||g' Dockerfile
        docker build . --file Dockerfile --tag localbuild/testimage:latest      
    - name: Run the local Anchore scan action itself with GitHub Advanced Security code scanning integration enabled
-      uses: anchore/scan-action@master
+      uses: anchore/scan-action@v2
+      id: scan
      with:
-        image-reference: "localbuild/testimage:latest"
-        dockerfile-path: "docker/server/Dockerfile"
+        image: "localbuild/testimage:latest"
        acs-report-enable: true
-        fail-build: true
-    - name: Upload artifact
-      uses: actions/upload-artifact@v1.0.0
-      with:
-        name: AnchoreReports
-        path: ./anchore-reports/
    - name: Upload Anchore Scan Report
      uses: github/codeql-action/upload-sarif@v1
      with:
-        sarif_file: results.sarif
+        sarif_file: ${{ steps.scan.outputs.sarif }}
--- a/.gitmodules
+++ b/.gitmodules
@ -53,7 +53,8 @@
 	url = https://github.com/ClickHouse-Extras/Turbo-Base64.git
 [submodule "contrib/arrow"]
 	path = contrib/arrow
-	url = https://github.com/apache/arrow
+	url = https://github.com/ClickHouse-Extras/arrow
+	branch = clickhouse-arrow-2.0.0
 [submodule "contrib/thrift"]
 	path = contrib/thrift
 	url = https://github.com/apache/thrift.git
--- a/base/common/ReplxxLineReader.cpp
+++ b/base/common/ReplxxLineReader.cpp
@ -6,6 +6,12 @@
 #include <unistd.h>
 #include <functional>
 #include <sys/file.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <csignal>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <fstream>

 namespace
 {
@ -83,6 +89,8 @@ ReplxxLineReader::ReplxxLineReader(
    /// it also binded to M-p/M-n).
    rx.bind_key(Replxx::KEY::meta('N'), [this](char32_t code) { return rx.invoke(Replxx::ACTION::COMPLETE_NEXT, code); });
    rx.bind_key(Replxx::KEY::meta('P'), [this](char32_t code) { return rx.invoke(Replxx::ACTION::COMPLETE_PREVIOUS, code); });
+
+    rx.bind_key(Replxx::KEY::meta('E'), [this](char32_t) { openEditor(); return Replxx::ACTION_RESULT::CONTINUE; });
 }

 ReplxxLineReader::~ReplxxLineReader()
@ -127,7 +135,114 @@ void ReplxxLineReader::addToHistory(const String & line)
        rx.print("Unlock of history file failed: %s\n", errnoToString(errno).c_str());
 }

+/// Runs `command` through "/bin/sh -c" and waits for it to finish.
+/// Returns the raw wait status filled in by waitpid() (0 when the shell exited with code 0),
+/// or -1 if the child process could not be started or waited for.
+int ReplxxLineReader::execute(const std::string & command)
+{
+    /// execv() takes non-const argument strings, so keep writable null-terminated copies.
+    std::vector<char> argv0("sh", &("sh"[3]));
+    std::vector<char> argv1("-c", &("-c"[3]));
+    std::vector<char> argv2(command.c_str(), command.c_str() + command.size() + 1);
+
+    const char * filename = "/bin/sh";
+    char * const argv[] = {argv0.data(), argv1.data(), argv2.data(), nullptr};
+
+    /// The address of vfork is looked up once, at the first call.
+    static void * real_vfork = dlsym(RTLD_DEFAULT, "vfork");
+    if (!real_vfork)
+    {
+        /// dlsym does not set errno; its diagnostics are only available through dlerror().
+        /// The lookup happens once, so on later calls the message may already have been consumed.
+        const char * reason = dlerror();
+        rx.print("Cannot find symbol vfork in myself: %s\n", reason ? reason : "unknown error");
+        return -1;
+    }
+
+    pid_t pid = reinterpret_cast<pid_t (*)()>(real_vfork)();
+
+    if (-1 == pid)
+    {
+        rx.print("Cannot vfork: %s\n", errnoToString(errno).c_str());
+        return -1;
+    }
+
+    if (0 == pid)
+    {
+        /// Child: read the current signal mask and unblock every signal in it before exec.
+        sigset_t mask;
+        sigemptyset(&mask);
+        sigprocmask(0, nullptr, &mask);
+        sigprocmask(SIG_UNBLOCK, &mask, nullptr);
+
+        execv(filename, argv);
+        /// Reached only if execv failed.
+        _exit(-1);
+    }
+
+    int status = 0;
+    while (-1 == waitpid(pid, &status, 0))
+    {
+        /// A signal delivered to this process is not a reason to stop waiting for the child.
+        if (errno == EINTR)
+            continue;
+
+        rx.print("Cannot waitpid: %s\n", errnoToString(errno).c_str());
+        return -1;
+    }
+    return status;
+}
+
+/// Lets the user edit the current input line in an external editor ($EDITOR, or "vim" if it is unset or empty).
+/// The line is written to a temporary file and the editor is run on it; if the editor finishes with status 0,
+/// the content of the file becomes the new input line. The temporary file is removed on every path.
+void ReplxxLineReader::openEditor()
+{
+    /// NOTE(review): the name is relative, so the file is created in the current working directory - confirm this is intended.
+    char filename[] = "clickhouse_replxx_XXXXXX.sql";
+    int fd = ::mkstemps(filename, 4);
+    if (-1 == fd)
+    {
+        rx.print("Cannot create temporary file to edit query: %s\n", errnoToString(errno).c_str());
+        return;
+    }
+
+    auto remove_temporary_file = [&]
+    {
+        if (0 != ::unlink(filename))
+            rx.print("Cannot remove temporary query file %s: %s\n", filename, errnoToString(errno).c_str());
+    };
+
+    /// std::getenv returns nullptr when the variable is not set, and a String must not be constructed from nullptr.
+    const char * editor_from_env = std::getenv("EDITOR");
+    String editor = (editor_from_env && *editor_from_env) ? editor_from_env : "vim";
+
+    replxx::Replxx::State state(rx.get_state());
+
+    size_t bytes_written = 0;
+    const char * begin = state.text();
+    size_t offset = strlen(state.text());
+    while (bytes_written != offset)
+    {
+        ssize_t res = ::write(fd, begin + bytes_written, offset - bytes_written);
+        if (res > 0)
+        {
+            bytes_written += res;
+            continue;
+        }
+
+        /// Interrupted before anything was written: retry. res is -1 here and must not be added to bytes_written.
+        if (-1 == res && errno == EINTR)
+            continue;
+
+        rx.print("Cannot write to temporary query file %s: %s\n", filename, errnoToString(errno).c_str());
+        ::close(fd);
+        remove_temporary_file();
+        return;
+    }
+
+    if (0 != ::close(fd))
+    {
+        rx.print("Cannot close temporary query file %s: %s\n", filename, errnoToString(errno).c_str());
+        remove_temporary_file();
+        return;
+    }
+
+    if (0 == execute(editor + " " + filename))
+    {
+        try
+        {
+            std::ifstream t(filename);
+            if (t)
+            {
+                std::string str((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
+                rx.set_state(replxx::Replxx::State(str.c_str(), str.size()));
+            }
+            else
+                rx.print("Cannot read from temporary query file %s: %s\n", filename, errnoToString(errno).c_str());
+        }
+        catch (...)
+        {
+            /// Fall through, so that bracketed paste is restored and the file is removed in this case too.
+            rx.print("Cannot read from temporary query file %s: %s\n", filename, errnoToString(errno).c_str());
+        }
+    }
+
+    if (bracketed_paste_enabled)
+        enableBracketedPaste();
+
+    remove_temporary_file();
+}
+
 void ReplxxLineReader::enableBracketedPaste()
 {
+    bracketed_paste_enabled = true; /// remembered so that openEditor() can enable bracketed paste again after the editor exits
     rx.enable_bracketed_paste();
 };
--- a/base/common/ReplxxLineReader.h
+++ b/base/common/ReplxxLineReader.h
@ -22,10 +22,13 @@ public:
 private:
    InputStatus readOneLine(const String & prompt) override;
    void addToHistory(const String & line) override;
+    int execute(const std::string & command);
+    void openEditor();

    replxx::Replxx rx;
    replxx::Replxx::highlighter_callback_t highlighter;

    // used to call flock() to synchronize multiple clients using same history file
    int history_file_fd = -1;
+    bool bracketed_paste_enabled = false;
 };
--- a/base/common/defines.h
+++ b/base/common/defines.h
@ -61,6 +61,20 @@
 #    endif
 #endif

+#if defined(ADDRESS_SANITIZER)
+#    define BOOST_USE_ASAN 1
+#    define BOOST_USE_UCONTEXT 1
+#endif
+
+#if defined(THREAD_SANITIZER)
+#    define BOOST_USE_TSAN 1
+#    define BOOST_USE_UCONTEXT 1
+#endif
+
+#if defined(ARCADIA_BUILD) && defined(BOOST_USE_UCONTEXT)
+#    undef BOOST_USE_UCONTEXT
+#endif
+
 /// TODO: Strange enough, there is no way to detect UB sanitizer.

 /// Explicitly allow undefined behaviour for certain functions. Use it as a function attribute.
--- a/base/common/wide_integer_impl.h
+++ b/base/common/wide_integer_impl.h
@ -5,9 +5,11 @@
 /// (See at http://www.boost.org/LICENSE_1_0.txt)

 #include "throwError.h"
+#include <cmath>
 #include <cfloat>
-#include <limits>
 #include <cassert>
+#include <limits>
+

 namespace wide
 {
@ -239,6 +241,14 @@ struct integer<Bits, Signed>::_impl
    template <class T>
    constexpr static void set_multiplier(integer<Bits, Signed> & self, T t) noexcept {
        constexpr uint64_t max_int = std::numeric_limits<uint64_t>::max();
+
+        /// Implementation specific behaviour on overflow (if we don't check here, stack overflow will be triggered in bigint_cast).
+        if (!std::isfinite(t))
+        {
+            self = 0;
+            return;
+        }
+
        const T alpha = t / max_int;

        if (alpha <= max_int)
--- a/base/daemon/BaseDaemon.cpp
+++ b/base/daemon/BaseDaemon.cpp
@ -4,6 +4,11 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/resource.h>
+#if defined(__linux__)
+    #include <sys/prctl.h>
+#endif
 #include <fcntl.h>
 #include <errno.h>
 #include <string.h>
@ -12,7 +17,6 @@
 #include <unistd.h>

 #include <typeinfo>
-#include <sys/resource.h>
 #include <iostream>
 #include <fstream>
 #include <sstream>
@ -22,7 +26,6 @@
 #include <Poco/Observer.h>
 #include <Poco/AutoPtr.h>
 #include <Poco/PatternFormatter.h>
-#include <Poco/TaskManager.h>
 #include <Poco/File.h>
 #include <Poco/Path.h>
 #include <Poco/Message.h>
@ -470,7 +473,6 @@ BaseDaemon::~BaseDaemon()

 void BaseDaemon::terminate()
 {
-    getTaskManager().cancelAll();
    if (::raise(SIGTERM) != 0)
        throw Poco::SystemException("cannot terminate process");
 }
@ -478,22 +480,11 @@ void BaseDaemon::terminate()
 void BaseDaemon::kill()
 {
    dumpCoverageReportIfPossible();
-    pid.reset();
+    pid_file.reset();
    if (::raise(SIGKILL) != 0)
        throw Poco::SystemException("cannot kill process");
 }

-void BaseDaemon::sleep(double seconds)
-{
-    wakeup_event.reset();
-    wakeup_event.tryWait(seconds * 1000);
-}
-
-void BaseDaemon::wakeup()
-{
-    wakeup_event.set();
-}
-
 std::string BaseDaemon::getDefaultCorePath() const
 {
    return "/opt/cores/";
@ -564,7 +555,6 @@ void BaseDaemon::initialize(Application & self)
 {
    closeFDs();

-    task_manager = std::make_unique<Poco::TaskManager>();
    ServerApplication::initialize(self);

    /// now highest priority (lowest value) is PRIO_APPLICATION = -100, we want higher!
@ -648,10 +638,6 @@ void BaseDaemon::initialize(Application & self)
            throw Poco::OpenFileException("Cannot attach stdout to " + stdout_path);
    }

-    /// Create pid file.
-    if (config().has("pid"))
-        pid.emplace(config().getString("pid"), DB::StatusFile::write_pid);
-
    /// Change path for logging.
    if (!log_path.empty())
    {
@ -667,9 +653,17 @@ void BaseDaemon::initialize(Application & self)
            throw Poco::Exception("Cannot change directory to /tmp");
    }

-    // sensitive data masking rules are not used here
+    /// sensitive data masking rules are not used here
    buildLoggers(config(), logger(), self.commandName());

+    /// After initialized loggers but before initialized signal handling.
+    if (should_setup_watchdog)
+        setupWatchdog();
+
+    /// Create pid file.
+    if (config().has("pid"))
+        pid_file.emplace(config().getString("pid"), DB::StatusFile::write_pid);
+
    if (is_daemon)
    {
        /** Change working directory to the directory to write core dumps.
@ -704,54 +698,71 @@ void BaseDaemon::initialize(Application & self)
 }


+/// Installs `handler` (SA_SIGINFO style) for every signal in `signals`.
+/// While the handler runs, all signals of the same group are blocked (they form sa_mask).
+/// If `out_handled_signals` is not null, the signals are appended to it after successful installation.
+/// Throws Poco::Exception if the handler cannot be installed.
+static void addSignalHandler(const std::vector<int> & signals, signal_function handler, std::vector<int> * out_handled_signals)
+{
+    struct sigaction action;
+    memset(&action, 0, sizeof(action));
+    action.sa_sigaction = handler;
+    action.sa_flags = SA_SIGINFO;
+
+#if defined(OS_DARWIN)
+    /// On Darwin the results of these calls are not checked.
+    sigemptyset(&action.sa_mask);
+    for (int signo : signals)
+        sigaddset(&action.sa_mask, signo);
+#else
+    if (0 != sigemptyset(&action.sa_mask))
+        throw Poco::Exception("Cannot set signal handler.");
+
+    for (int signo : signals)
+    {
+        if (0 != sigaddset(&action.sa_mask, signo))
+            throw Poco::Exception("Cannot set signal handler.");
+    }
+#endif
+
+    for (int signo : signals)
+    {
+        if (0 != sigaction(signo, &action, nullptr))
+            throw Poco::Exception("Cannot set signal handler.");
+    }
+
+    if (nullptr != out_handled_signals)
+        out_handled_signals->insert(out_handled_signals->end(), signals.begin(), signals.end());
+}
+
+
+/// Adds every signal in `signals` to the set of signals blocked in the calling thread.
+/// Throws Poco::Exception if the signal mask cannot be changed.
+static void blockSignals(const std::vector<int> & signals)
+{
+    sigset_t blocked;
+
+#if defined(OS_DARWIN)
+    /// On Darwin the results of these calls are not checked.
+    sigemptyset(&blocked);
+    for (int signo : signals)
+        sigaddset(&blocked, signo);
+#else
+    if (0 != sigemptyset(&blocked))
+        throw Poco::Exception("Cannot block signal.");
+
+    for (int signo : signals)
+    {
+        if (0 != sigaddset(&blocked, signo))
+            throw Poco::Exception("Cannot block signal.");
+    }
+#endif
+
+    if (0 != pthread_sigmask(SIG_BLOCK, &blocked, nullptr))
+        throw Poco::Exception("Cannot block signal.");
+}
+
+
 void BaseDaemon::initializeTerminationAndSignalProcessing()
 {
    SentryWriter::initialize(config());
    std::set_terminate(terminate_handler);

    /// We want to avoid SIGPIPE when working with sockets and pipes, and just handle return value/errno instead.
-    {
-        sigset_t sig_set;
-        if (sigemptyset(&sig_set) || sigaddset(&sig_set, SIGPIPE) || pthread_sigmask(SIG_BLOCK, &sig_set, nullptr))
-            throw Poco::Exception("Cannot block signal.");
-    }
+    blockSignals({SIGPIPE});

    /// Setup signal handlers.
-    auto add_signal_handler =
-        [this](const std::vector<int> & signals, signal_function handler)
-        {
-            struct sigaction sa;
-            memset(&sa, 0, sizeof(sa));
-            sa.sa_sigaction = handler;
-            sa.sa_flags = SA_SIGINFO;
-
-            {
-#if defined(OS_DARWIN)
-                sigemptyset(&sa.sa_mask);
-                for (auto signal : signals)
-                    sigaddset(&sa.sa_mask, signal);
-#else
-                if (sigemptyset(&sa.sa_mask))
-                    throw Poco::Exception("Cannot set signal handler.");
-
-                for (auto signal : signals)
-                    if (sigaddset(&sa.sa_mask, signal))
-                        throw Poco::Exception("Cannot set signal handler.");
-#endif
-
-                for (auto signal : signals)
-                    if (sigaction(signal, &sa, nullptr))
-                        throw Poco::Exception("Cannot set signal handler.");
-
-                std::copy(signals.begin(), signals.end(), std::back_inserter(handled_signals));
-            }
-        };
-
    /// SIGTSTP is added for debugging purposes. To output a stack trace of any running thread at anytime.

-    add_signal_handler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP}, signalHandler);
-    add_signal_handler({SIGHUP, SIGUSR1}, closeLogsSignalHandler);
-    add_signal_handler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler);
+    addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP}, signalHandler, &handled_signals);
+    addSignalHandler({SIGHUP, SIGUSR1}, closeLogsSignalHandler, &handled_signals);
+    addSignalHandler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler, &handled_signals);

 #if defined(SANITIZER)
    __sanitizer_set_death_callback(sanitizerDeathCallback);
@ -786,23 +797,6 @@ void BaseDaemon::logRevision() const
        + ", PID " + std::to_string(getpid()));
 }

-/// Makes server shutdown if at least one Poco::Task have failed.
-void BaseDaemon::exitOnTaskError()
-{
-    Poco::Observer<BaseDaemon, Poco::TaskFailedNotification> obs(*this, &BaseDaemon::handleNotification);
-    getTaskManager().addObserver(obs);
-}
-
-/// Used for exitOnTaskError()
-void BaseDaemon::handleNotification(Poco::TaskFailedNotification *_tfn)
-{
-    task_failed = true;
-    Poco::AutoPtr<Poco::TaskFailedNotification> fn(_tfn);
-    Poco::Logger * lg = &(logger());
-    LOG_ERROR(lg, "Task '{}' failed. Daemon is shutting down. Reason - {}", fn->task()->name(), fn->reason().displayText());
-    ServerApplication::terminate();
-}
-
 void BaseDaemon::defineOptions(Poco::Util::OptionSet & new_options)
 {
    new_options.addOption(
@ -863,13 +857,144 @@ void BaseDaemon::onInterruptSignals(int signal_id)
    if (sigint_signals_counter >= 2)
    {
        LOG_INFO(&logger(), "Received second signal Interrupt. Immediately terminate.");
-        kill();
+        call_default_signal_handler(signal_id);
+        /// If the above did not help.
+        _exit(128 + signal_id);
    }
 }


 void BaseDaemon::waitForTerminationRequest()
 {
+    /// NOTE: as we already process signals via pipe, we don't have to block them with sigprocmask in threads
    std::unique_lock<std::mutex> lock(signal_handler_mutex);
    signal_event.wait(lock, [this](){ return terminate_signals_counter > 0; });
 }
+
+
+void BaseDaemon::shouldSetupWatchdog(char * argv0_)
+{
+    should_setup_watchdog = true; /// checked in initialize(), which then calls setupWatchdog()
+    argv0 = argv0_; /// kept so that setupWatchdog() can overwrite the process name of the watchdog process
+}
+
+
+/// Forks the process. The child returns from this function and continues as the server;
+/// the parent becomes a watchdog: it forwards signals to the child, waits for it to terminate,
+/// logs the reason and exits itself.
+/// Exit code of the watchdog: the exit code of the child if it exited normally,
+/// 128 + signal number if the child was terminated by a signal.
+void BaseDaemon::setupWatchdog()
+{
+    /// Initialize in advance to avoid double initialization in forked processes.
+    DateLUT::instance();
+
+    std::string original_process_name;
+    if (argv0)
+        original_process_name = argv0;
+
+    while (true)
+    {
+        /// Static, so that the captureless signal handler below can refer to it.
+        static pid_t pid = -1;
+        pid = fork();
+
+        if (-1 == pid)
+            throw Poco::Exception("Cannot fork");
+
+        if (0 == pid)
+        {
+            logger().information("Forked a child process to watch");
+#if defined(__linux__)
+            if (0 != prctl(PR_SET_PDEATHSIG, SIGKILL))
+                logger().warning("Cannot do prctl to ask termination with parent.");
+#endif
+            return;
+        }
+
+        /// Change short thread name and process name.
+        setThreadName("clckhouse-watch");   /// 15 characters
+
+        if (argv0)
+        {
+            const char * new_process_name = "clickhouse-watchdog";
+            memset(argv0, 0, original_process_name.size());
+            memcpy(argv0, new_process_name, std::min(strlen(new_process_name), original_process_name.size()));
+        }
+
+        logger().information(fmt::format("Will watch for the process with pid {}", pid));
+
+        /// Forward signals to the child process.
+        addSignalHandler(
+            {SIGHUP, SIGUSR1, SIGINT, SIGQUIT, SIGTERM},
+            [](int sig, siginfo_t *, void *)
+            {
+                /// Forward all signals except INT as it can be send by terminal to the process group when user press Ctrl+C,
+                /// and we process double delivery of this signal as immediate termination.
+                if (sig == SIGINT)
+                    return;
+
+                const char * error_message = "Cannot forward signal to the child process.\n";
+                if (0 != ::kill(pid, sig))
+                {
+                    auto res = write(STDERR_FILENO, error_message, strlen(error_message));
+                    (void)res;
+                }
+            },
+            nullptr);
+
+        int status = 0;
+        /// Set explicitly instead of inspecting errno after the loop: errno may be changed by the calls in between.
+        bool child_no_longer_exists = false;
+        do
+        {
+            if (-1 != waitpid(pid, &status, WUNTRACED | WCONTINUED))
+            {
+                if (WIFSTOPPED(status))
+                    logger().warning(fmt::format("Child process was stopped by signal {}.", WSTOPSIG(status)));
+                else if (WIFCONTINUED(status))
+                    logger().warning(fmt::format("Child process was continued."));
+                else
+                    break;
+            }
+            else if (errno == ECHILD)
+            {
+                /// status was not updated by this call, so it must not be interpreted.
+                child_no_longer_exists = true;
+                break;
+            }
+            else if (errno != EINTR)
+                throw Poco::Exception("Cannot waitpid, errno: " + std::string(strerror(errno)));
+        } while (true);
+
+        if (child_no_longer_exists)
+        {
+            logger().information("Child process no longer exists.");
+            /// The exit status of the child is not available in this case.
+            _exit(0);
+        }
+
+        if (WIFEXITED(status))
+        {
+            logger().information(fmt::format("Child process exited normally with code {}.", WEXITSTATUS(status)));
+            /// Only the low 8 bits of the argument of _exit are reported to the parent,
+            /// and the exit code is stored in the higher bits of the wait status, so it has to be extracted.
+            _exit(WEXITSTATUS(status));
+        }
+
+        /// Used when the child did not exit normally.
+        int exit_code = 1;
+
+        if (WIFSIGNALED(status))
+        {
+            int sig = WTERMSIG(status);
+            exit_code = 128 + sig;
+
+            if (sig == SIGKILL)
+            {
+                logger().fatal(fmt::format("Child process was terminated by signal {} (KILL)."
+                    " If it is not done by 'forcestop' command or manually,"
+                    " the possible cause is OOM Killer (see 'dmesg' and look at the '/var/log/kern.log' for the details).", sig));
+            }
+            else
+            {
+                logger().fatal(fmt::format("Child process was terminated by signal {}.", sig));
+
+                if (sig == SIGINT || sig == SIGTERM || sig == SIGQUIT)
+                    _exit(exit_code);
+            }
+        }
+        else
+        {
+            logger().fatal("Child process was not exited normally by unknown reason.");
+        }
+
+        /// Automatic restart is not enabled but you can play with it.
+#if 1
+        _exit(exit_code);
+#else
+        logger().information("Will restart.");
+        if (argv0)
+            memcpy(argv0, original_process_name.c_str(), original_process_name.size());
+#endif
+    }
+}
--- a/base/daemon/BaseDaemon.h
+++ b/base/daemon/BaseDaemon.h
@ -12,7 +12,6 @@
 #include <chrono>
 #include <Poco/Process.h>
 #include <Poco/ThreadPool.h>
-#include <Poco/TaskNotification.h>
 #include <Poco/Util/Application.h>
 #include <Poco/Util/ServerApplication.h>
 #include <Poco/Net/SocketAddress.h>
@ -26,9 +25,6 @@
 #include <loggers/Loggers.h>


-namespace Poco { class TaskManager; }
-
-
 /// \brief Base class for applications that can run as daemons.
 ///
 /// \code
@ -52,31 +48,26 @@ public:
    BaseDaemon();
    ~BaseDaemon() override;

-    /// Загружает конфигурацию и "строит" логгеры на запись в файлы
+    /// Load configuration, prepare loggers, etc.
    void initialize(Poco::Util::Application &) override;

-    /// Читает конфигурацию
    void reloadConfiguration();

-    /// Определяет параметр командной строки
+    /// Process command line parameters
    void defineOptions(Poco::Util::OptionSet & new_options) override;

-    /// Заставляет демон завершаться, если хотя бы одна задача завершилась неудачно
-    void exitOnTaskError();
+    /// Graceful shutdown
+    static void terminate();

-    /// Завершение демона ("мягкое")
-    void terminate();
-
-    /// Завершение демона ("жёсткое")
+    /// Forceful shutdown
    void kill();

-    /// Получен ли сигнал на завершение?
+    /// Cancellation request has been received.
    bool isCancelled() const
    {
        return is_cancelled;
    }

-    /// Получение ссылки на экземпляр демона
    static BaseDaemon & instance()
    {
        return dynamic_cast<BaseDaemon &>(Poco::Util::Application::instance());
@ -85,12 +76,6 @@ public:
    /// return none if daemon doesn't exist, reference to the daemon otherwise
    static std::optional<std::reference_wrapper<BaseDaemon>> tryGetInstance() { return tryGetInstance<BaseDaemon>(); }

-    /// Спит заданное количество секунд или до события wakeup
-    void sleep(double seconds);
-
-    /// Разбудить
-    void wakeup();
-
    /// In Graphite, path components (folders) are separated by a dot.
    /// We use paths of the form root_path.hostname_yandex_ru.key
    /// root_path is one_min by default
@ -131,24 +116,23 @@ public:
    /// also doesn't close global internal pipes for signal handling
    static void closeFDs();

+    /// If this method is called after initialization and before run,
+    /// will fork child process and setup watchdog that will print diagnostic info, if the child terminates.
+    /// argv0 is needed to change process name (consequently, it is needed for scripts involving "pgrep", "pidof" to work correctly).
+    void shouldSetupWatchdog(char * argv0_);
+
 protected:
-    /// Возвращает TaskManager приложения
-    /// все методы task_manager следует вызывать из одного потока
-    /// иначе возможен deadlock, т.к. joinAll выполняется под локом, а любой метод тоже берет лок
-    Poco::TaskManager & getTaskManager() { return *task_manager; }
-
    virtual void logRevision() const;

-    /// Используется при exitOnTaskError()
-    void handleNotification(Poco::TaskFailedNotification *);
-
    /// thread safe
    virtual void handleSignal(int signal_id);

    /// initialize termination process and signal handlers
    virtual void initializeTerminationAndSignalProcessing();

-    /// реализация обработки сигналов завершения через pipe не требует блокировки сигнала с помощью sigprocmask во всех потоках
+    /// fork the main process and watch if it was killed
+    void setupWatchdog();
+
    void waitForTerminationRequest()
 #if defined(POCO_CLICKHOUSE_PATCH) || POCO_VERSION >= 0x02000000 // in old upstream poco not vitrual
    override
@ -162,21 +146,13 @@ protected:

    virtual std::string getDefaultCorePath() const;

-    std::unique_ptr<Poco::TaskManager> task_manager;
-
-    std::optional<DB::StatusFile> pid;
+    std::optional<DB::StatusFile> pid_file;

    std::atomic_bool is_cancelled{false};

-    /// Флаг устанавливается по сообщению из Task (при аварийном завершении).
-    bool task_failed = false;
-
    bool log_to_console = false;

-    /// Событие, чтобы проснуться во время ожидания
-    Poco::Event wakeup_event;
-
-    /// Поток, в котором принимается сигнал HUP/USR1 для закрытия логов.
+    /// A thread that acts on HUP and USR1 signal (close logs).
    Poco::Thread signal_listener_thread;
    std::unique_ptr<Poco::Runnable> signal_listener;

@ -194,6 +170,9 @@ protected:
    String build_id_info;

    std::vector<int> handled_signals;
+
+    bool should_setup_watchdog = false;
+    char * argv0 = nullptr;
 };


--- a/base/glibc-compatibility/musl/__polevll.c
+++ b/base/glibc-compatibility/musl/__polevll.c
@ -0,0 +1,93 @@
+/* origin: OpenBSD /usr/src/lib/libm/src/polevll.c */
+/*
+ * Copyright (c) 2008 Stephen L. Moshier <steve@moshier.net>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ *      Evaluate polynomial
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * long double x, y, coef[N+1], polevl[];
+ *
+ * y = polevll( x, coef, N );
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates polynomial of degree N:
+ *
+ *                     2          N
+ * y  =  C  + C x + C x  +...+ C x
+ *        0    1     2          N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C  , ..., coef[N] = C  .
+ *            N                   0
+ *
+ *  The function p1evll() assumes that coef[N] = 1.0 and is
+ * omitted from the array.  Its calling arguments are
+ * otherwise the same as polevll().
+ *
+ *
+ * SPEED:
+ *
+ * In the interest of speed, there are no checks for out
+ * of bounds arithmetic.  This routine is used by most of
+ * the functions in the library.  Depending on available
+ * equipment features, the user may wish to rewrite the
+ * program in microcode or assembly language.
+ *
+ */
+
+#include "libm.h"
+
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
+#else
+/*
+ * Polynomial evaluator:
+ *  P[0] x^n  +  P[1] x^(n-1)  +  ...  +  P[n]
+ */
+long double __polevll(long double x, const long double *P, int n)
+{
+	long double y;
+
+	y = *P++; /* start from the leading coefficient P[0] */
+	do {
+		y = y * x + *P++; /* Horner step: multiply by x and add the next coefficient */
+	} while (--n); /* the body runs before n is tested, so n must be >= 1; n + 1 coefficients are read in total */
+
+	return y;
+}
+
+/*
+ * Polynomial evaluator:
+ *  x^n  +  P[0] x^(n-1)  +  P[1] x^(n-2)  +  ...  +  P[n]
+ */
+long double __p1evll(long double x, const long double *P, int n)
+{
+	long double y;
+
+	n -= 1; /* the leading coefficient is not stored, so one Horner step fewer is needed */
+	y = x + *P++; /* same as 1 * x + P[0] */
+	do {
+		y = y * x + *P++; /* Horner step: multiply by x and add the next coefficient */
+	} while (--n); /* the body runs before n is tested, so the caller must pass n >= 2; n coefficients are read in total */
+
+	return y;
+}
+#endif
--- a/base/glibc-compatibility/musl/mkstemps.c
+++ b/base/glibc-compatibility/musl/mkstemps.c
@ -0,0 +1,44 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+/* This assumes that a check for the
+   template size has already been made */
+static char * __randname(char * template)
+{
+    int i;
+    struct timespec ts;
+    unsigned long r;
+
+    clock_gettime(CLOCK_REALTIME, &ts);
+    r = (ts.tv_nsec * 65537) ^ ((((intptr_t)(&ts)) / 16) + ((intptr_t)template)); /* mixes the nanoseconds of the clock with two addresses */
+    for (i = 0; i < 6; i++, r >>= 5) /* 5 bits of r are consumed per character, 6 characters are written */
+        template[i] = 'A' + (r & 15) + (r & 16) * 2; /* low 4 bits select one of 16 letters starting at 'A'; bit 4 adds 32 (lower case in ASCII) */
+
+    return template;
+}
+
+int mkstemps(char * template, int len) /* len is the length of the suffix that follows the "XXXXXX" placeholder */
+{
+    size_t l = strlen(template);
+    if (l < 6 || len > l - 6 || memcmp(template + l - len - 6, "XXXXXX", 6)) /* the placeholder must sit right before the len-byte suffix */
+    {
+        errno = EINVAL;
+        return -1;
+    }
+
+    int fd, retries = 100;
+    do
+    {
+        __randname(template + l - len - 6); /* overwrite the placeholder in place */
+        if ((fd = open(template, O_RDWR | O_CREAT | O_EXCL, 0600)) >= 0) /* O_EXCL: fail with EEXIST instead of opening an existing file */
+            return fd;
+    } while (--retries && errno == EEXIST); /* retry only on name collisions, at most 100 attempts */
+
+    memcpy(template + l - len - 6, "XXXXXX", 6); /* restore the placeholder on failure */
+    return -1;
+}
--- a/base/glibc-compatibility/musl/powf.c
+++ b/base/glibc-compatibility/musl/powf.c
@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "libm.h"
+#include "exp2f_data.h"
+#include "powf_data.h"
+
+/*
+POWF_LOG2_POLY_ORDER = 5
+EXP2F_TABLE_BITS = 5
+
+ULP error: 0.82 (~ 0.5 + relerr*2^24)
+relerr: 1.27 * 2^-26 (Relative error ~= 128*Ln2*relerr_log2 + relerr_exp2)
+relerr_log2: 1.83 * 2^-33 (Relative error of logx.)
+relerr_exp2: 1.69 * 2^-34 (Relative error of exp2(ylogx).)
+*/
+
+#define N (1 << POWF_LOG2_TABLE_BITS)
+#define T __powf_log2_data.tab
+#define A __powf_log2_data.poly
+#define OFF 0x3f330000
+
+/* Subnormal input is normalized so ix has negative biased exponent.
+   Output is multiplied by N (POWF_SCALE) if TOINT_INTRINSICS is set.  */
+static inline double_t log2_inline(uint32_t ix)
+{
+	double_t z, r, r2, r4, p, q, y, y0, invc, logc;
+	uint32_t iz, top, tmp;
+	int k, i;
+
+	/* x = 2^k z; where z is in range [OFF,2*OFF] and exact.
+	   The range is split into N subintervals.
+	   The ith subinterval contains z and c is near its center.  */
+	tmp = ix - OFF;
+	i = (tmp >> (23 - POWF_LOG2_TABLE_BITS)) % N;
+	top = tmp & 0xff800000;
+	iz = ix - top;
+	k = (int32_t)top >> (23 - POWF_SCALE_BITS); /* arithmetic shift */
+	invc = T[i].invc;
+	logc = T[i].logc;
+	z = (double_t)asfloat(iz);
+
+	/* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
+	r = z * invc - 1;
+	y0 = logc + (double_t)k;
+
+	/* Pipelined polynomial evaluation to approximate log1p(r)/ln2.  */
+	r2 = r * r;
+	y = A[0] * r + A[1];
+	p = A[2] * r + A[3];
+	r4 = r2 * r2;
+	q = A[4] * r + y0;
+	q = p * r2 + q;
+	y = y * r4 + q;
+	return y;
+}
+
+#undef N
+#undef T
+#define N (1 << EXP2F_TABLE_BITS)
+#define T __exp2f_data.tab
+#define SIGN_BIAS (1 << (EXP2F_TABLE_BITS + 11))
+
+/* The output of log2 and thus the input of exp2 is either scaled by N
+   (in case of fast toint intrinsics) or not.  The unscaled xd must be
+   in [-1021,1023], sign_bias sets the sign of the result.  */
+static inline float exp2_inline(double_t xd, uint32_t sign_bias)
+{
+	uint64_t ki, ski, t;
+	double_t kd, z, r, r2, y, s;
+
+#if TOINT_INTRINSICS
+#define C __exp2f_data.poly_scaled
+	/* N*x = k + r with r in [-1/2, 1/2] */
+	kd = roundtoint(xd); /* k */
+	ki = converttoint(xd);
+#else
+#define C __exp2f_data.poly
+#define SHIFT __exp2f_data.shift_scaled
+	/* x = k/N + r with r in [-1/(2N), 1/(2N)] */
+	kd = eval_as_double(xd + SHIFT);
+	ki = asuint64(kd);
+	kd -= SHIFT; /* k/N */
+#endif
+	r = xd - kd;
+
+	/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
+	t = T[ki % N];
+	ski = ki + sign_bias;
+	t += ski << (52 - EXP2F_TABLE_BITS);
+	s = asdouble(t);
+	z = C[0] * r + C[1];
+	r2 = r * r;
+	y = C[2] * r + 1;
+	y = z * r2 + y;
+	y = y * s;
+	return eval_as_float(y);
+}
+
+/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
+   the bit representation of a non-zero finite floating-point value.  */
+static inline int checkint(uint32_t iy)
+{
+	int e = iy >> 23 & 0xff; /* biased exponent field */
+	if (e < 0x7f) /* magnitude is below 1 */
+		return 0;
+	if (e > 0x7f + 23) /* every stored bit has weight >= 2, so the value is an even integer */
+		return 2;
+	if (iy & ((1 << (0x7f + 23 - e)) - 1)) /* a bit below the units position is set: there is a fractional part */
+		return 0;
+	if (iy & (1 << (0x7f + 23 - e))) /* the bit at the units position is set: odd */
+		return 1;
+	return 2;
+}
+
+static inline int zeroinfnan(uint32_t ix)
+{
+	return 2 * ix - 1 >= 2u * 0x7f800000 - 1; /* 2 * ix drops the sign bit; zero wraps around to 0xffffffff, inf and nan are >= the bound, finite non-zero values are below it */
+}
+
+float powf(float x, float y)
+{
+	uint32_t sign_bias = 0;
+	uint32_t ix, iy;
+
+	ix = asuint(x);
+	iy = asuint(y);
+	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000 ||
+			  zeroinfnan(iy))) {
+		/* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
+		if (predict_false(zeroinfnan(iy))) {
+			if (2 * iy == 0)
+				return issignalingf_inline(x) ? x + y : 1.0f;
+			if (ix == 0x3f800000)
+				return issignalingf_inline(y) ? x + y : 1.0f;
+			if (2 * ix > 2u * 0x7f800000 ||
+			    2 * iy > 2u * 0x7f800000)
+				return x + y;
+			if (2 * ix == 2 * 0x3f800000)
+				return 1.0f;
+			if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000))
+				return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf.  */
+			return y * y;
+		}
+		if (predict_false(zeroinfnan(ix))) {
+			float_t x2 = x * x;
+			if (ix & 0x80000000 && checkint(iy) == 1)
+				x2 = -x2;
+			/* Without the barrier some versions of clang hoist the 1/x2 and
+			   thus division by zero exception can be signaled spuriously.  */
+			return iy & 0x80000000 ? fp_barrierf(1 / x2) : x2;
+		}
+		/* x and y are non-zero finite.  */
+		if (ix & 0x80000000) {
+			/* Finite x < 0.  */
+			int yint = checkint(iy);
+			if (yint == 0)
+				return __math_invalidf(x);
+			if (yint == 1)
+				sign_bias = SIGN_BIAS;
+			ix &= 0x7fffffff;
+		}
+		if (ix < 0x00800000) {
+			/* Normalize subnormal x so exponent becomes negative.  */
+			ix = asuint(x * 0x1p23f);
+			ix &= 0x7fffffff;
+			ix -= 23 << 23;
+		}
+	}
+	double_t logx = log2_inline(ix);
+	double_t ylogx = y * logx; /* cannot overflow, y is single prec.  */
+	if (predict_false((asuint64(ylogx) >> 47 & 0xffff) >=
+			  asuint64(126.0 * POWF_SCALE) >> 47)) {
+		/* |y*log(x)| >= 126.  */
+		if (ylogx > 0x1.fffffffd1d571p+6 * POWF_SCALE)
+			return __math_oflowf(sign_bias);
+		if (ylogx <= -150.0 * POWF_SCALE)
+			return __math_uflowf(sign_bias);
+	}
+	return exp2_inline(ylogx, sign_bias);
+}
--- a/base/glibc-compatibility/musl/powf_data.c
+++ b/base/glibc-compatibility/musl/powf_data.c
@ -0,0 +1,34 @@
+/*
+ * Data definition for powf.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "powf_data.h"
+
+/*
+ * Lookup data for the base-2 logarithm step of powf.
+ *
+ * .tab holds 16 {invc, logc} pairs.  In every entry logc equals
+ * -log2(invc) (times POWF_SCALE), i.e. logc is log2 of c = 1/invc; the
+ * entry {1, 0} is the pair for c == 1.
+ *
+ * .poly holds 5 polynomial coefficients, each pre-multiplied by
+ * POWF_SCALE.  Their values are close to log2(e)/5, -log2(e)/4,
+ * log2(e)/3, -log2(e)/2 and log2(e), i.e. the series for log2(1+r)
+ * listed from the highest-degree term down to the linear term.
+ * NOTE(review): the exact evaluation scheme lives in log2_inline(), which
+ * is not part of this file - confirm there before changing the ordering.
+ */
+const struct powf_log2_data __powf_log2_data = {
+  .tab = {
+  { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * POWF_SCALE },
+  { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * POWF_SCALE },
+  { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * POWF_SCALE },
+  { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * POWF_SCALE },
+  { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * POWF_SCALE },
+  { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * POWF_SCALE },
+  { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * POWF_SCALE },
+  { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * POWF_SCALE },
+  { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * POWF_SCALE },
+  { 0x1p+0, 0x0p+0 * POWF_SCALE },
+  { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * POWF_SCALE },
+  { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * POWF_SCALE },
+  { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * POWF_SCALE },
+  { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * POWF_SCALE },
+  { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * POWF_SCALE },
+  { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * POWF_SCALE },
+  },
+  .poly = {
+  0x1.27616c9496e0bp-2 * POWF_SCALE, -0x1.71969a075c67ap-2 * POWF_SCALE,
+  0x1.ec70a6ca7baddp-2 * POWF_SCALE, -0x1.7154748bef6c8p-1 * POWF_SCALE,
+  0x1.71547652ab82bp0 * POWF_SCALE,
+  }
+};
--- a/base/glibc-compatibility/musl/powf_data.h
+++ b/base/glibc-compatibility/musl/powf_data.h
@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef _POWF_DATA_H
+#define _POWF_DATA_H
+
+#include "libm.h"
+#include "exp2f_data.h"
+
+/* The log2 lookup table has 1 << POWF_LOG2_TABLE_BITS (= 16) entries.  */
+#define POWF_LOG2_TABLE_BITS 4
+/* Number of coefficients stored in powf_log2_data.poly.  */
+#define POWF_LOG2_POLY_ORDER 5
+/* When TOINT_INTRINSICS is enabled the log2 data is scaled by
+   2^EXP2F_TABLE_BITS, otherwise it is left unscaled (2^0 == 1).  */
+#if TOINT_INTRINSICS
+#define POWF_SCALE_BITS EXP2F_TABLE_BITS
+#else
+#define POWF_SCALE_BITS 0
+#endif
+/* 2^POWF_SCALE_BITS as a double.  */
+#define POWF_SCALE ((double)(1 << POWF_SCALE_BITS))
+/* Layout of the powf log2 data object (the declaration only; the object
+   itself is named __powf_log2_data).
+   NOTE(review): `hidden` is presumably a symbol-visibility macro supplied
+   by libm.h - confirm there.  */
+extern hidden const struct powf_log2_data {
+	struct {
+		/* One table entry; the field names suggest 1/c and log2(c) for a
+		   per-entry constant c - TODO confirm against the data file.  */
+		double invc, logc;
+	} tab[1 << POWF_LOG2_TABLE_BITS];
+	/* Polynomial coefficients.  */
+	double poly[POWF_LOG2_POLY_ORDER];
+} __powf_log2_data;
+
+#endif
--- a/base/glibc-compatibility/musl/powl.c
+++ b/base/glibc-compatibility/musl/powl.c
@ -0,0 +1,525 @@
+/* origin: OpenBSD /usr/src/lib/libm/src/ld80/e_powl.c */
+/*
+ * Copyright (c) 2008 Stephen L. Moshier <steve@moshier.net>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*                                                      powl.c
+ *
+ *      Power function, long double precision
+ *
+ *
+ * SYNOPSIS:
+ *
+ * long double x, y, z, powl();
+ *
+ * z = powl( x, y );
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Computes x raised to the yth power.  Analytically,
+ *
+ *      x**y  =  exp( y log(x) ).
+ *
+ * Following Cody and Waite, this program uses a lookup table
+ * of 2**-i/32 and pseudo extended precision arithmetic to
+ * obtain several extra bits of accuracy in both the logarithm
+ * and the exponential.
+ *
+ *
+ * ACCURACY:
+ *
+ * The relative error of pow(x,y) can be estimated
+ * by   y dl ln(2),   where dl is the absolute error of
+ * the internally computed base 2 logarithm.  At the ends
+ * of the approximation interval the logarithm equals 1/32
+ * and its relative error is about 1 lsb = 1.1e-19.  Hence
+ * the predicted relative error in the result is 2.3e-21 y .
+ *
+ *                      Relative error:
+ * arithmetic   domain     # trials      peak         rms
+ *
+ *    IEEE     +-1000       40000      2.8e-18      3.7e-19
+ * .001 < x < 1000, with log(x) uniformly distributed.
+ * -1000 < y < 1000, y uniformly distributed.
+ *
+ *    IEEE     0,8700       60000      6.5e-18      1.0e-18
+ * 0.99 < x < 1.01, 0 < y < 8700, uniformly distributed.
+ *
+ *
+ * ERROR MESSAGES:
+ *
+ *   message         condition      value returned
+ * pow overflow     x**y > MAXNUM      INFINITY
+ * pow underflow   x**y < 1/MAXNUM       0.0
+ * pow domain      x<0 and y noninteger  0.0
+ *
+ */
+
+#include "libm.h"
+
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
+/* long double has the same format as double on this configuration
+ * (LDBL_MANT_DIG == 53), so powl simply forwards to pow.
+ */
+long double powl(long double x, long double y)
+{
+	return pow(x, y);
+}
+#elif LDBL_MANT_DIG == 64 && LDBL_MAX_EXP == 16384
+
+/* Table size */
+#define NXT 32
+
+/* log(1+x) =  x - .5x^2 + x^3 *  P(x)/Q(x)
+ * on the domain  2^(-1/32) - 1  <=  x  <=  2^(1/32) - 1
+ */
+static const long double P[] = {
+ 8.3319510773868690346226E-4L,
+ 4.9000050881978028599627E-1L,
+ 1.7500123722550302671919E0L,
+ 1.4000100839971580279335E0L,
+};
+static const long double Q[] = {
+/* 1.0000000000000000000000E0L,*/
+ 5.2500282295834889175431E0L,
+ 8.4000598057587009834666E0L,
+ 4.2000302519914740834728E0L,
+};
+/* A[i] = 2^(-i/32), rounded to IEEE long double precision.
+ * If i is even, A[i] + B[i/2] gives additional accuracy.
+ */
+static const long double A[33] = {
+ 1.0000000000000000000000E0L,
+ 9.7857206208770013448287E-1L,
+ 9.5760328069857364691013E-1L,
+ 9.3708381705514995065011E-1L,
+ 9.1700404320467123175367E-1L,
+ 8.9735453750155359320742E-1L,
+ 8.7812608018664974155474E-1L,
+ 8.5930964906123895780165E-1L,
+ 8.4089641525371454301892E-1L,
+ 8.2287773907698242225554E-1L,
+ 8.0524516597462715409607E-1L,
+ 7.8799042255394324325455E-1L,
+ 7.7110541270397041179298E-1L,
+ 7.5458221379671136985669E-1L,
+ 7.3841307296974965571198E-1L,
+ 7.2259040348852331001267E-1L,
+ 7.0710678118654752438189E-1L,
+ 6.9195494098191597746178E-1L,
+ 6.7712777346844636413344E-1L,
+ 6.6261832157987064729696E-1L,
+ 6.4841977732550483296079E-1L,
+ 6.3452547859586661129850E-1L,
+ 6.2092890603674202431705E-1L,
+ 6.0762367999023443907803E-1L,
+ 5.9460355750136053334378E-1L,
+ 5.8186242938878875689693E-1L,
+ 5.6939431737834582684856E-1L,
+ 5.5719337129794626814472E-1L,
+ 5.4525386633262882960438E-1L,
+ 5.3357020033841180906486E-1L,
+ 5.2213689121370692017331E-1L,
+ 5.1094857432705833910408E-1L,
+ 5.0000000000000000000000E-1L,
+};
+static const long double B[17] = {
+ 0.0000000000000000000000E0L,
+ 2.6176170809902549338711E-20L,
+-1.0126791927256478897086E-20L,
+ 1.3438228172316276937655E-21L,
+ 1.2207982955417546912101E-20L,
+-6.3084814358060867200133E-21L,
+ 1.3164426894366316434230E-20L,
+-1.8527916071632873716786E-20L,
+ 1.8950325588932570796551E-20L,
+ 1.5564775779538780478155E-20L,
+ 6.0859793637556860974380E-21L,
+-2.0208749253662532228949E-20L,
+ 1.4966292219224761844552E-20L,
+ 3.3540909728056476875639E-21L,
+-8.6987564101742849540743E-22L,
+-1.2327176863327626135542E-20L,
+ 0.0000000000000000000000E0L,
+};
+
+/* 2^x = 1 + x R(x),
+ * on the interval -1/32 <= x <= 0
+ */
+static const long double R[] = {
+ 1.5089970579127659901157E-5L,
+ 1.5402715328927013076125E-4L,
+ 1.3333556028915671091390E-3L,
+ 9.6181291046036762031786E-3L,
+ 5.5504108664798463044015E-2L,
+ 2.4022650695910062854352E-1L,
+ 6.9314718055994530931447E-1L,
+};
+
+/* Overflow limit for y*log2(x) expressed in units of 1/NXT; powl compares
+ * it against (Ga + Ha) * NXT.
+ */
+#define MEXP (NXT*16384.0L)
+/* Underflow limit in the same units.  This value is the one to use when
+ * denormal numbers are supported; without denormals it would be -MEXP.
+ */
+#define MNEXP (-NXT*(16384.0L+64.0L))
+/* log2(e) - 1 */
+#define LOG2EA 0.44269504088896340735992L
+
+/* F, G and H (and their a/b parts) are not separate variables: they are
+ * aliases for the locals W, Wa, Wb and u declared in powl.  Several names
+ * share one storage location (F, G, H -> W; Fa, Ga -> Wa; Fb, Ha, Hb -> Wb;
+ * Gb -> u), so assigning one overwrites the others and the statement order
+ * in powl matters.
+ */
+#define F W
+#define Fa Wa
+#define Fb Wb
+#define G W
+#define Ga Wa
+#define Gb u
+#define H W
+#define Ha Wb
+#define Hb Wb
+
+/* Thresholds on the estimated natural log of the result, used by powil to
+ * detect overflow (MAXLOGL) and underflow (MINLOGL).
+ */
+static const long double MAXLOGL = 1.1356523406294143949492E4L;
+static const long double MINLOGL = -1.13994985314888605586758E4L;
+/* ln(2) */
+static const long double LOGE2L = 6.9314718055994530941723E-1L;
+/* huge * huge is returned to signal overflow. */
+static const long double huge = 0x1p10000L;
+/* twom10000 * twom10000 is returned to signal underflow. */
+/* XXX Prevent gcc from erroneously constant folding this. */
+static const volatile long double twom10000 = 0x1p-10000L;
+
+/* Local helpers, defined after powl. */
+static long double reducl(long double);
+static long double powil(long double, int);
+
+/* Polynomial evaluators defined elsewhere.
+ * NOTE(review): judging by the "1.0" entry commented out of Q[], __p1evll
+ * presumably assumes an implicit leading coefficient of 1 - confirm.
+ */
+long double __polevll(long double x, const long double *P, int n);
+long double __p1evll(long double x, const long double *P, int n);
+
+/*
+ * powl: x raised to the power y, for the 80-bit extended long double format.
+ *
+ * Flow:
+ *   1. NaN, 1, 0, infinite and other special operands are answered directly.
+ *   2. Negative x is accepted only for integer y; the sign of the result is
+ *      remembered in nflg and the computation continues with |x|.
+ *   3. Integer x with integer |y| < 32768 is delegated to powil().
+ *   4. Otherwise log2(x) is computed as w + z (table part + small part),
+ *      multiplied by y in pseudo extended precision, and 2**(product) is
+ *      rebuilt from table A[], polynomial R[] and scalbnl().
+ *
+ * Returns huge*huge on overflow and twom10000*twom10000 on underflow.
+ */
+long double powl(long double x, long double y)
+{
+	/* F, Fa, Fb, G, Ga, Gb, H, Ha, Hb are macro aliases of W, Wa, Wb and u */
+	int i, nflg, iyflg, yoddint;
+	long e;
+	volatile long double z=0;
+	long double w=0, W=0, Wa=0, Wb=0, ya=0, yb=0, u=0;
+
+	/* make sure no invalid exception is raised by nan comparison */
+	if (isnan(x)) {
+		if (!isnan(y) && y == 0.0)
+			return 1.0;
+		return x;
+	}
+	if (isnan(y)) {
+		if (x == 1.0)
+			return 1.0;
+		return y;
+	}
+	/* From here on neither x nor y is NaN. */
+	if (x == 1.0)
+		return 1.0; /* 1**y = 1 (the NaN y case was handled above) */
+	if (x == -1.0 && !isfinite(y))
+		return 1.0; /* (-1)**(+-inf) = 1 */
+	if (y == 0.0)
+		return 1.0; /* x**0 = 1 (the NaN x case was handled above) */
+	if (y == 1.0)
+		return x;
+	if (y >= LDBL_MAX) {
+		if (x > 1.0 || x < -1.0)
+			return INFINITY;
+		if (x != 0.0)
+			return 0.0;
+	}
+	if (y <= -LDBL_MAX) {
+		if (x > 1.0 || x < -1.0)
+			return 0.0;
+		if (x != 0.0 || y == -INFINITY)
+			return INFINITY;
+	}
+	if (x >= LDBL_MAX) {
+		if (y > 0.0)
+			return INFINITY;
+		return 0.0;
+	}
+
+	w = floorl(y);
+
+	/* Set iyflg to 1 if y is an integer. */
+	iyflg = 0;
+	if (w == y)
+		iyflg = 1;
+
+	/* Test for odd integer y. */
+	yoddint = 0;
+	if (iyflg) {
+		/* y is odd exactly when floor(|y|/2) differs from |y|/2 */
+		ya = fabsl(y);
+		ya = floorl(0.5 * ya);
+		yb = 0.5 * fabsl(w);
+		if( ya != yb )
+			yoddint = 1;
+	}
+
+	if (x <= -LDBL_MAX) {
+		if (y > 0.0) {
+			if (yoddint)
+				return -INFINITY;
+			return INFINITY;
+		}
+		if (y < 0.0) {
+			if (yoddint)
+				return -0.0;
+			return 0.0;
+		}
+	}
+	nflg = 0; /* (x<0)**(odd int) */
+	if (x <= 0.0) {
+		if (x == 0.0) {
+			if (y < 0.0) {
+				if (signbit(x) && yoddint)
+					/* (-0.0)**(-odd int) = -inf, divbyzero */
+					return -1.0/0.0;
+				/* (+-0.0)**(negative) = inf, divbyzero */
+				return 1.0/0.0;
+			}
+			if (signbit(x) && yoddint)
+				return -0.0;
+			return 0.0;
+		}
+		if (iyflg == 0)
+			return (x - x) / (x - x); /* (x<0)**(non-int) is NaN */
+		/* (x<0)**(integer) */
+		if (yoddint)
+			nflg = 1; /* negate result */
+		x = -x;
+	}
+	/* (+integer)**(integer)  */
+	if (iyflg && floorl(x) == x && fabsl(y) < 32768.0) {
+		w = powil(x, (int)y);
+		return nflg ? -w : w;
+	}
+
+	/* separate significand from exponent */
+	x = frexpl(x, &i);
+	e = i;
+
+	/* find significand in antilog table A[]
+	 * (the search keeps i odd; the final adjustment makes it even,
+	 * so B[i/2] below is the correction that belongs to A[i])
+	 */
+	i = 1;
+	if (x <= A[17])
+		i = 17;
+	if (x <= A[i+8])
+		i += 8;
+	if (x <= A[i+4])
+		i += 4;
+	if (x <= A[i+2])
+		i += 2;
+	if (x >= A[1])
+		i = -1;
+	i += 1;
+
+	/* Find (x - A[i])/A[i]
+	 * in order to compute log(x/A[i]):
+	 *
+	 * log(x) = log( a x/a ) = log(a) + log(x/a)
+	 *
+	 * log(x/a) = log(1+v),  v = x/a - 1 = (x-a)/a
+	 */
+	x -= A[i];
+	x -= B[i/2];
+	x /= A[i];
+
+	/* rational approximation for log(1+v):
+	 *
+	 * log(1+v)  =  v  -  v**2/2  +  v**3 P(v) / Q(v)
+	 */
+	z = x*x;
+	w = x * (z * __polevll(x, P, 3) / __p1evll(x, Q, 3));
+	w = w - 0.5*z;
+
+	/* Convert to base 2 logarithm:
+	 * multiply by log2(e) = 1 + LOG2EA
+	 */
+	z = LOG2EA * w;
+	z += w;
+	z += LOG2EA * x;
+	z += x;
+
+	/* Compute exponent term of the base 2 logarithm. */
+	w = -i;
+	w /= NXT;
+	w += e;
+	/* Now base 2 log of x is w + z. */
+
+	/* Multiply base 2 log by y, in extended precision. */
+
+	/* separate y into large part ya
+	 * and small part yb less than 1/NXT
+	 */
+	ya = reducl(y);
+	yb = y - ya;
+
+	/* (w+z)(ya+yb)
+	 * = w*ya + w*yb + z*y
+	 */
+	F = z * y  +  w * yb;
+	Fa = reducl(F);
+	Fb = F - Fa;
+
+	G = Fa + w * ya;
+	Ga = reducl(G);
+	Gb = G - Ga;
+
+	H = Fb + Gb;
+	Ha = reducl(H);
+	w = (Ga + Ha) * NXT;
+
+	/* Test the power of 2 for overflow */
+	if (w > MEXP)
+		return huge * huge;  /* overflow */
+	if (w < MNEXP)
+		return twom10000 * twom10000;  /* underflow */
+
+	e = w;
+	/* Ha and Hb share storage (both alias Wb): this overwrites Ha */
+	Hb = H - Ha;
+
+	if (Hb > 0.0) {
+		e += 1;
+		Hb -= 1.0/NXT;  /* NXT is 32, so this subtracts 1/32 */
+	}
+
+	/* Now the product y * log2(x)  =  Hb + e/NXT.
+	 *
+	 * Compute base 2 exponential of Hb,
+	 * where -1/NXT <= Hb <= 0.
+	 */
+	z = Hb * __polevll(Hb, R, 6);  /*  z = 2**Hb - 1  */
+
+	/* Express e/NXT as an integer plus a negative number of (1/NXT)ths.
+	 * Find lookup table entry for the fractional power of 2.
+	 */
+	if (e < 0)
+		i = 0;
+	else
+		i = 1;
+	i = e/NXT + i;
+	e = NXT*i - e;
+	w = A[e];
+	z = w * z;  /*  2**-e * ( 1 + (2**Hb-1) )  */
+	z = z + w;
+	z = scalbnl(z, i);  /* multiply by integer power of 2 */
+
+	if (nflg)
+		z = -z;
+	return z;
+}
+
+
+/* Round x down to a multiple of 1/NXT: the returned value t satisfies
+ * t = k/NXT for an integer k, with t <= x < t + 1/NXT.
+ */
+static long double reducl(long double x)
+{
+	return floorl(x * NXT) / NXT;
+}
+
+/*
+ *      Positive real raised to integer power, long double precision
+ *
+ *
+ * SYNOPSIS:
+ *
+ * long double x, y, powil();
+ * int n;
+ *
+ * y = powil( x, n );
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns argument x>0 raised to the nth power.
+ * The routine efficiently decomposes n as a sum of powers of
+ * two. The desired power is a product of two-to-the-kth
+ * powers of x.  Thus to compute the 32767 power of x requires
+ * 28 multiplications instead of 32767 multiplications.
+ *
+ *
+ * ACCURACY:
+ *
+ *                      Relative error:
+ * arithmetic   x domain   n domain  # trials      peak         rms
+ *    IEEE     .001,1000  -1022,1023  50000       4.3e-17     7.8e-18
+ *    IEEE        1,2     -1022,1023  20000       3.9e-17     7.6e-18
+ *    IEEE     .99,1.01     0,8700    10000       3.6e-16     7.2e-17
+ *
+ * Returns huge*huge (which overflows to infinity) on overflow and
+ * twom10000*twom10000 (which underflows to zero) on underflow.
+ */
+
+/* x**nn for x > 0 by binary exponentiation (square and multiply).
+ *
+ * An estimate of the natural log of the result is formed first so that
+ * overflow and underflow can be reported before any multiplication is
+ * done.  When the result would be a tiny (denormal) value, the loop runs
+ * on 1/x instead of inverting the final product.
+ */
+static long double powil(long double x, int nn)
+{
+	long double frac, lnest, prod, sq;
+	int bits, binexp, rough, invert;
+
+	if (nn == 0)
+		return 1.0;
+
+	/* Work with |nn|; remember whether the result must be inverted. */
+	invert = nn < 0;
+	bits = invert ? -nn : nn;
+
+	/* Overflow detection */
+
+	/* Calculate approximate logarithm of answer */
+	frac = frexpl(x, &binexp);
+	rough = (binexp - 1)*bits;
+	if (rough != 0 && rough <= 64 && rough >= -64) {
+		/* exponent-only estimate */
+		lnest = LOGE2L * rough;
+	} else {
+		/* refine the estimate using the significand as well */
+		frac = (frac - 7.0710678118654752e-1L) / (frac +  7.0710678118654752e-1L);
+		lnest = (2.9142135623730950L * frac - 0.5 + binexp) * nn * LOGE2L;
+	}
+
+	if (lnest > MAXLOGL)
+		return huge * huge;  /* overflow */
+
+	if (lnest < MINLOGL)
+		return twom10000 * twom10000;  /* underflow */
+	/* Handle tiny denormal answer, but with less accuracy
+	 * since roundoff error in 1.0/x will be amplified.
+	 * The precise demarcation should be the gradual underflow threshold.
+	 */
+	if (lnest < -MAXLOGL+2.0) {
+		x = 1.0/x;
+		invert = !invert;
+	}
+
+	/* Lowest bit of the exponent selects the initial product. */
+	prod = (bits & 1) ? x : 1.0;
+
+	/* Each remaining bit: square the running power, multiply it in if set. */
+	sq = x;
+	for (bits >>= 1; bits != 0; bits >>= 1) {
+		sq = sq * sq;
+		if (bits & 1)
+			prod *= sq;
+	}
+
+	return invert ? 1.0/prod : prod;
+}
+#elif LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384
+// TODO: broken implementation to make things compile
+// For the 113-bit (quad precision) long double format there is no native
+// implementation here: the call is forwarded to pow(), so the result is
+// computed by the double routine rather than in long double precision.
+long double powl(long double x, long double y)
+{
+	return pow(x, y);
+}
+#endif
--- a/base/glibc-compatibility/musl/timerfd.c
+++ b/base/glibc-compatibility/musl/timerfd.c
@ -0,0 +1,17 @@
+#include <sys/timerfd.h>
+#include "syscall.h"
+
+/* Thin wrapper: issues the timerfd_create system call with clockid and
+ * flags passed through unchanged and returns the result of syscall().
+ * NOTE(review): how failures are reported (e.g. -1 with errno) depends on
+ * the syscall() provided by "syscall.h", which is not visible here.
+ */
+int timerfd_create(int clockid, int flags)
+{
+    return syscall(SYS_timerfd_create, clockid, flags);
+}
+
+/* Thin wrapper: issues the timerfd_settime system call with fd, flags,
+ * new and old passed through unchanged and returns the result of syscall().
+ * NOTE(review): no time64 / struct layout translation is done here; the
+ * itimerspec pointers are handed to the kernel as they are.
+ */
+int timerfd_settime(int fd, int flags, const struct itimerspec *new, struct itimerspec *old)
+{
+    return syscall(SYS_timerfd_settime, fd, flags, new, old);
+}
+
+/* Thin wrapper: issues the timerfd_gettime system call with fd and cur
+ * passed through unchanged and returns the result of syscall().
+ */
+int timerfd_gettime(int fd, struct itimerspec *cur)
+{
+    return syscall(SYS_timerfd_gettime, fd, cur);
+}
--- a/cmake/find/parquet.cmake
+++ b/cmake/find/parquet.cmake
@ -141,11 +141,6 @@ if(NOT EXTERNAL_PARQUET_FOUND AND NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT O
    else()
    set(USE_INTERNAL_PARQUET_LIBRARY 1)

-    if(USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
-        set(ARROW_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src")
-        set(PARQUET_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src" ${ClickHouse_BINARY_DIR}/contrib/arrow/cpp/src)
-    endif()
-
    if(MAKE_STATIC_LIBRARIES)
        set(FLATBUFFERS_LIBRARY flatbuffers)
        set(ARROW_LIBRARY arrow_static)
@ -155,9 +150,6 @@ if(NOT EXTERNAL_PARQUET_FOUND AND NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT O
        set(FLATBUFFERS_LIBRARY flatbuffers_shared)
        set(ARROW_LIBRARY arrow_shared)
        set(PARQUET_LIBRARY parquet_shared)
-        if(USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
-            list(APPEND PARQUET_LIBRARY boost::regex)
-        endif()
        set(THRIFT_LIBRARY thrift)
    endif()

--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@ -163,51 +163,21 @@ if(USE_INTERNAL_SNAPPY_LIBRARY)
 endif()

 if (USE_INTERNAL_PARQUET_LIBRARY)
-if (USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
    # We dont use arrow's cmakefiles because they uses too many depends and download some libs in compile time
-    # But this mode can be used for updating auto-generated parquet files:
-    # cmake -DUSE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE=1 -DUSE_STATIC_LIBRARIES=0
-    # copy {BUILD_DIR}/contrib/arrow/cpp/src/parquet/*.cpp,*.h -> /contrib/arrow-cmake/cpp/src/parquet/
+    # But you can update auto-generated parquet files manually:
+    # cd {BUILD_DIR}/contrib/arrow/cpp/src/parquet && mkdir -p build && cd build
+    # cmake .. -DARROW_COMPUTE=ON -DARROW_PARQUET=ON -DARROW_SIMD_LEVEL=NONE -DARROW_VERBOSE_THIRDPARTY_BUILD=ON
+    #          -DARROW_BUILD_SHARED=1 -DARROW_BUILD_UTILITIES=OFF -DARROW_BUILD_INTEGRATION=OFF
+    #          -DBoost_FOUND=1 -DARROW_TEST_LINKAGE="shared"
+    # make -j8
+    # copy {BUILD_DIR}/contrib/arrow/cpp/src/parquet/*.cpp,*.h -> {BUILD_DIR}/contrib/arrow-cmake/cpp/src/parquet/

    # Also useful parquet reader:
-    # cd contrib/arrow/cpp/build && mkdir -p build && cmake .. -DPARQUET_BUILD_EXECUTABLES=1 && make -j8
-    # contrib/arrow/cpp/build/debug/parquet-reader some_file.parquet
+    # cd {BUILD_DIR}/contrib/arrow/cpp && mkdir -p build && cd build
+    # cmake .. -DARROW_PARQUET=1 -DARROW_WITH_SNAPPY=1 -DPARQUET_BUILD_EXECUTABLES=1
+    # make -j8
+    # {BUILD_DIR}/contrib/arrow/cpp/build/release/parquet-reader some_file.parquet

-    set (ARROW_COMPUTE ON CACHE INTERNAL "")
-    set (ARROW_PARQUET ON CACHE INTERNAL "")
-    set (ARROW_VERBOSE_THIRDPARTY_BUILD ON CACHE INTERNAL "")
-    set (ARROW_BUILD_SHARED 1 CACHE INTERNAL "")
-    set (ARROW_BUILD_UTILITIES OFF CACHE INTERNAL "")
-    set (ARROW_BUILD_INTEGRATION OFF CACHE INTERNAL "")
-    set (ARROW_BOOST_HEADER_ONLY ON CACHE INTERNAL "")
-    set (Boost_FOUND 1 CACHE INTERNAL "")
-    if (MAKE_STATIC_LIBRARIES)
-        set (PARQUET_ARROW_LINKAGE "static" CACHE INTERNAL "")
-        set (ARROW_TEST_LINKAGE "static" CACHE INTERNAL "")
-        set (ARROW_BUILD_STATIC ${MAKE_STATIC_LIBRARIES} CACHE INTERNAL "")
-    else ()
-        set (PARQUET_ARROW_LINKAGE "shared" CACHE INTERNAL "")
-        set (ARROW_TEST_LINKAGE "shared" CACHE INTERNAL "")
-    endif ()
-
-    if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO")
-        set (_save_build_type ${CMAKE_BUILD_TYPE})
-        set (CMAKE_BUILD_TYPE Release)
-        string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
-    endif ()
-
-    # Because Arrow uses CMAKE_SOURCE_DIR as a project path
-    # Hopefully will be fixed in https://github.com/apache/arrow/pull/2676
-    set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/cmake_modules")
-    add_subdirectory (arrow/cpp)
-
-    if (_save_build_type)
-        set (CMAKE_BUILD_TYPE ${_save_build_type})
-        unset (_save_build_type)
-        string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
-    endif ()
-
-else()
    add_subdirectory(arrow-cmake)

    # The library is large - avoid bloat.
@ -215,7 +185,6 @@ else()
    target_compile_options (${THRIFT_LIBRARY} PRIVATE -g0)
    target_compile_options (${PARQUET_LIBRARY} PRIVATE -g0)
 endif()
-endif()

 if (USE_INTERNAL_AVRO_LIBRARY)
    add_subdirectory(avro-cmake)
--- a/contrib/arrow
+++ b/contrib/arrow
@ -1 +1 @@
-Subproject commit 3cbcb7b62c2f2d02851bff837758637eb592a64b
+Subproject commit 744bdfe188f018e5e05f5deebd4e9ee0a7706cf4
--- a/contrib/arrow-cmake/CMakeLists.txt
+++ b/contrib/arrow-cmake/CMakeLists.txt
@ -144,15 +144,16 @@ set(ORC_SRCS

 set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow)

-configure_file("${LIBRARY_DIR}/util/config.h.cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cpp/src/arrow/util/config.h")
+configure_file("${LIBRARY_DIR}/util/config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/cpp/src/arrow/util/config.h")

 # arrow/cpp/src/arrow/CMakeLists.txt
 set(ARROW_SRCS
-        ${LIBRARY_DIR}/array.cc
        ${LIBRARY_DIR}/buffer.cc
-        ${LIBRARY_DIR}/device.cc
        ${LIBRARY_DIR}/builder.cc
+        ${LIBRARY_DIR}/chunked_array.cc
        ${LIBRARY_DIR}/compare.cc
+        ${LIBRARY_DIR}/datum.cc
+        ${LIBRARY_DIR}/device.cc
        ${LIBRARY_DIR}/extension_type.cc
        ${LIBRARY_DIR}/memory_pool.cc
        ${LIBRARY_DIR}/pretty_print.cc
@ -167,11 +168,12 @@ set(ARROW_SRCS
        ${LIBRARY_DIR}/type.cc
        ${LIBRARY_DIR}/visitor.cc

-        ${LIBRARY_DIR}/tensor/coo_converter.cc
-        ${LIBRARY_DIR}/tensor/csc_converter.cc
-        ${LIBRARY_DIR}/tensor/csf_converter.cc
-        ${LIBRARY_DIR}/tensor/csr_converter.cc
-
+        ${LIBRARY_DIR}/array/array_base.cc
+        ${LIBRARY_DIR}/array/array_binary.cc
+        ${LIBRARY_DIR}/array/array_decimal.cc
+        ${LIBRARY_DIR}/array/array_dict.cc
+        ${LIBRARY_DIR}/array/array_nested.cc
+        ${LIBRARY_DIR}/array/array_primitive.cc
        ${LIBRARY_DIR}/array/builder_adaptive.cc
        ${LIBRARY_DIR}/array/builder_base.cc
        ${LIBRARY_DIR}/array/builder_binary.cc
@ -181,17 +183,50 @@ set(ARROW_SRCS
        ${LIBRARY_DIR}/array/builder_primitive.cc
        ${LIBRARY_DIR}/array/builder_union.cc
        ${LIBRARY_DIR}/array/concatenate.cc
-        ${LIBRARY_DIR}/array/dict_internal.cc
+        ${LIBRARY_DIR}/array/data.cc
        ${LIBRARY_DIR}/array/diff.cc
+        ${LIBRARY_DIR}/array/util.cc
        ${LIBRARY_DIR}/array/validate.cc

-        ${LIBRARY_DIR}/csv/converter.cc
+        ${LIBRARY_DIR}/compute/api_scalar.cc
+        ${LIBRARY_DIR}/compute/api_vector.cc
+        ${LIBRARY_DIR}/compute/cast.cc
+        ${LIBRARY_DIR}/compute/exec.cc
+        ${LIBRARY_DIR}/compute/function.cc
+        ${LIBRARY_DIR}/compute/kernel.cc
+        ${LIBRARY_DIR}/compute/registry.cc
+
+        ${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc
+        ${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc
+        ${LIBRARY_DIR}/compute/kernels/aggregate_var_std.cc
+        ${LIBRARY_DIR}/compute/kernels/codegen_internal.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_arithmetic.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_boolean.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_cast_boolean.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_cast_internal.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_cast_nested.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_cast_numeric.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_cast_string.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_cast_temporal.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_compare.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_fill_null.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_nested.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_set_lookup.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_string.cc
+        ${LIBRARY_DIR}/compute/kernels/scalar_validity.cc
+        ${LIBRARY_DIR}/compute/kernels/vector_hash.cc
+        ${LIBRARY_DIR}/compute/kernels/vector_nested.cc
+        ${LIBRARY_DIR}/compute/kernels/vector_selection.cc
+        ${LIBRARY_DIR}/compute/kernels/vector_sort.cc
+        ${LIBRARY_DIR}/compute/kernels/util_internal.cc
+
        ${LIBRARY_DIR}/csv/chunker.cc
        ${LIBRARY_DIR}/csv/column_builder.cc
+        ${LIBRARY_DIR}/csv/column_decoder.cc
+        ${LIBRARY_DIR}/csv/converter.cc
        ${LIBRARY_DIR}/csv/options.cc
        ${LIBRARY_DIR}/csv/parser.cc
        ${LIBRARY_DIR}/csv/reader.cc
-        ${LIBRARY_DIR}/csv/column_decoder.cc

        ${LIBRARY_DIR}/ipc/dictionary.cc
        ${LIBRARY_DIR}/ipc/feather.cc
@ -202,14 +237,25 @@ set(ARROW_SRCS
        ${LIBRARY_DIR}/ipc/writer.cc

        ${LIBRARY_DIR}/io/buffered.cc
+        ${LIBRARY_DIR}/io/caching.cc
        ${LIBRARY_DIR}/io/compressed.cc
        ${LIBRARY_DIR}/io/file.cc
        ${LIBRARY_DIR}/io/interfaces.cc
        ${LIBRARY_DIR}/io/memory.cc
        ${LIBRARY_DIR}/io/slow.cc

+        ${LIBRARY_DIR}/tensor/coo_converter.cc
+        ${LIBRARY_DIR}/tensor/csf_converter.cc
+        ${LIBRARY_DIR}/tensor/csx_converter.cc
+
        ${LIBRARY_DIR}/util/basic_decimal.cc
+        ${LIBRARY_DIR}/util/bit_block_counter.cc
+        ${LIBRARY_DIR}/util/bit_run_reader.cc
        ${LIBRARY_DIR}/util/bit_util.cc
+        ${LIBRARY_DIR}/util/bitmap.cc
+        ${LIBRARY_DIR}/util/bitmap_builders.cc
+        ${LIBRARY_DIR}/util/bitmap_ops.cc
+        ${LIBRARY_DIR}/util/bpacking.cc
        ${LIBRARY_DIR}/util/compression.cc
        ${LIBRARY_DIR}/util/compression_lz4.cc
        ${LIBRARY_DIR}/util/compression_snappy.cc
@ -217,8 +263,12 @@ set(ARROW_SRCS
        ${LIBRARY_DIR}/util/compression_zstd.cc
        ${LIBRARY_DIR}/util/cpu_info.cc
        ${LIBRARY_DIR}/util/decimal.cc
+        ${LIBRARY_DIR}/util/delimiting.cc
+        ${LIBRARY_DIR}/util/formatting.cc
+        ${LIBRARY_DIR}/util/future.cc
        ${LIBRARY_DIR}/util/int_util.cc
        ${LIBRARY_DIR}/util/io_util.cc
+        ${LIBRARY_DIR}/util/iterator.cc
        ${LIBRARY_DIR}/util/key_value_metadata.cc
        ${LIBRARY_DIR}/util/logging.cc
        ${LIBRARY_DIR}/util/memory.cc
@ -226,27 +276,15 @@ set(ARROW_SRCS
        ${LIBRARY_DIR}/util/string.cc
        ${LIBRARY_DIR}/util/task_group.cc
        ${LIBRARY_DIR}/util/thread_pool.cc
+        ${LIBRARY_DIR}/util/time.cc
        ${LIBRARY_DIR}/util/trie.cc
        ${LIBRARY_DIR}/util/utf8.cc
-        ${LIBRARY_DIR}/util/future.cc
-        ${LIBRARY_DIR}/util/formatting.cc
-        ${LIBRARY_DIR}/util/parsing.cc
-        ${LIBRARY_DIR}/util/time.cc
-        ${LIBRARY_DIR}/util/delimiting.cc
-        ${LIBRARY_DIR}/util/iterator.cc
+        ${LIBRARY_DIR}/util/value_parsing.cc

        ${LIBRARY_DIR}/vendored/base64.cpp
        ${ORC_SRCS}
        )

-set(ARROW_SRCS ${ARROW_SRCS}
-        ${LIBRARY_DIR}/compute/context.cc
-        ${LIBRARY_DIR}/compute/kernels/boolean.cc
-        ${LIBRARY_DIR}/compute/kernels/cast.cc
-        ${LIBRARY_DIR}/compute/kernels/hash.cc
-        ${LIBRARY_DIR}/compute/kernels/util_internal.cc
-        )
-
 if (SNAPPY_INCLUDE_DIR AND SNAPPY_LIBRARY)
    set(ARROW_WITH_SNAPPY 1)
 endif ()
@ -289,7 +327,8 @@ if (USE_INTERNAL_PROTOBUF_LIBRARY)
    add_dependencies(${ARROW_LIBRARY} protoc)
 endif ()

-target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cpp/src)
+target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src)
+target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/cpp/src)
 target_link_libraries(${ARROW_LIBRARY} PRIVATE ${DOUBLE_CONVERSION_LIBRARIES} ${Protobuf_LIBRARY})
 target_link_libraries(${ARROW_LIBRARY} PRIVATE lz4)
 if (ARROW_WITH_SNAPPY)
@ -319,19 +358,26 @@ set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/parquet)
 set(GEN_LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/generated)
 # arrow/cpp/src/parquet/CMakeLists.txt
 set(PARQUET_SRCS
+        ${LIBRARY_DIR}/arrow/path_internal.cc
        ${LIBRARY_DIR}/arrow/reader.cc
        ${LIBRARY_DIR}/arrow/reader_internal.cc
        ${LIBRARY_DIR}/arrow/schema.cc
+        ${LIBRARY_DIR}/arrow/schema_internal.cc
        ${LIBRARY_DIR}/arrow/writer.cc
-        ${LIBRARY_DIR}/arrow/path_internal.cc
        ${LIBRARY_DIR}/bloom_filter.cc
        ${LIBRARY_DIR}/column_reader.cc
        ${LIBRARY_DIR}/column_scanner.cc
        ${LIBRARY_DIR}/column_writer.cc
        ${LIBRARY_DIR}/deprecated_io.cc
        ${LIBRARY_DIR}/encoding.cc
+        ${LIBRARY_DIR}/encryption.cc
+        ${LIBRARY_DIR}/encryption_internal.cc
        ${LIBRARY_DIR}/file_reader.cc
        ${LIBRARY_DIR}/file_writer.cc
+        ${LIBRARY_DIR}/internal_file_decryptor.cc
+        ${LIBRARY_DIR}/internal_file_encryptor.cc
+        ${LIBRARY_DIR}/level_conversion.cc
+        ${LIBRARY_DIR}/level_comparison.cc
        ${LIBRARY_DIR}/metadata.cc
        ${LIBRARY_DIR}/murmur3.cc
        ${LIBRARY_DIR}/platform.cc
@ -340,10 +386,6 @@ set(PARQUET_SRCS
        ${LIBRARY_DIR}/schema.cc
        ${LIBRARY_DIR}/statistics.cc
        ${LIBRARY_DIR}/types.cc
-        ${LIBRARY_DIR}/encryption.cc
-        ${LIBRARY_DIR}/encryption_internal.cc
-        ${LIBRARY_DIR}/internal_file_decryptor.cc
-        ${LIBRARY_DIR}/internal_file_encryptor.cc

        ${GEN_LIBRARY_DIR}/parquet_constants.cpp
        ${GEN_LIBRARY_DIR}/parquet_types.cpp
--- a/contrib/arrow-cmake/cpp/src/arrow/util/config.h
+++ b/contrib/arrow-cmake/cpp/src/arrow/util/config.h
@ -1,26 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#define ARROW_VERSION_MAJOR 
-#define ARROW_VERSION_MINOR 
-#define ARROW_VERSION_PATCH 
-#define ARROW_VERSION ((ARROW_VERSION_MAJOR * 1000) + ARROW_VERSION_MINOR) * 1000 + ARROW_VERSION_PATCH
-
-#define ARROW_SO_VERSION ""
-#define ARROW_FULL_SO_VERSION ""
-
-/* #undef GRPCPP_PP_INCLUDE */
--- a/contrib/arrow-cmake/cpp/src/parquet/parquet_version.h
+++ b/contrib/arrow-cmake/cpp/src/parquet/parquet_version.h
@ -22,8 +22,8 @@
 #define PARQUET_VERSION_MINOR 5
 #define PARQUET_VERSION_PATCH 1

-#define PARQUET_SO_VERSION 0
-#define PARQUET_FULL_SO_VERSION 0.17
+#define PARQUET_SO_VERSION "200"
+#define PARQUET_FULL_SO_VERSION "200.0.0"

 // define the parquet created by version
 #define CREATED_BY_VERSION "parquet-cpp version 1.5.1-SNAPSHOT"
--- a/contrib/boost-cmake/CMakeLists.txt
+++ b/contrib/boost-cmake/CMakeLists.txt
@ -11,10 +11,11 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
        iostreams
        program_options
        regex
+        context
    )

    if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND
-       Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY)
+       Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY)

        set(EXTERNAL_BOOST_FOUND 1)

@ -27,18 +28,21 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
        add_library (_boost_program_options INTERFACE)
        add_library (_boost_regex INTERFACE)
        add_library (_boost_system INTERFACE)
+        add_library (_boost_context INTERFACE)

        target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY})
        target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY})
        target_link_libraries (_boost_program_options INTERFACE ${Boost_PROGRAM_OPTIONS_LIBRARY})
        target_link_libraries (_boost_regex INTERFACE ${Boost_REGEX_LIBRARY})
        target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY})
+        target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY})

        add_library (boost::filesystem ALIAS _boost_filesystem)
        add_library (boost::iostreams ALIAS _boost_iostreams)
        add_library (boost::program_options ALIAS _boost_program_options)
        add_library (boost::regex ALIAS _boost_regex)
        add_library (boost::system ALIAS _boost_system)
+        add_library (boost::context ALIAS _boost_context)
    else()
        set(EXTERNAL_BOOST_FOUND 0)
        message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost")
@ -142,4 +146,57 @@ if (NOT EXTERNAL_BOOST_FOUND)
    add_library (_boost_system ${SRCS_SYSTEM})
    add_library (boost::system ALIAS _boost_system)
    target_include_directories (_boost_system PRIVATE ${LIBRARY_DIR})
+
+    # context
+    enable_language(ASM)
+    SET(ASM_OPTIONS "-x assembler-with-cpp")
+
+    if (SANITIZE AND (SANITIZE STREQUAL "address" OR SANITIZE STREQUAL "thread"))
+        add_compile_definitions(BOOST_USE_UCONTEXT)
+
+        if (SANITIZE STREQUAL "address")
+            add_compile_definitions(BOOST_USE_ASAN)
+        elseif (SANITIZE STREQUAL "thread")
+            add_compile_definitions(BOOST_USE_TSAN)
+        endif()
+
+        set (SRCS_CONTEXT
+                ${LIBRARY_DIR}/libs/context/src/fiber.cpp
+                ${LIBRARY_DIR}/libs/context/src/continuation.cpp
+                ${LIBRARY_DIR}/libs/context/src/dummy.cpp
+                ${LIBRARY_DIR}/libs/context/src/execution_context.cpp
+                ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
+        )
+    elseif (ARCH_ARM)
+        set (SRCS_CONTEXT
+            ${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S
+            ${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S
+            ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S
+            ${LIBRARY_DIR}/libs/context/src/dummy.cpp
+            ${LIBRARY_DIR}/libs/context/src/execution_context.cpp
+            ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
+        )
+    elseif(OS_DARWIN)
+        set (SRCS_CONTEXT
+            ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_macho_gas.S
+            ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_macho_gas.S
+            ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_macho_gas.S
+            ${LIBRARY_DIR}/libs/context/src/dummy.cpp
+            ${LIBRARY_DIR}/libs/context/src/execution_context.cpp
+            ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
+        )
+    else()
+        set (SRCS_CONTEXT
+            ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_elf_gas.S
+            ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_elf_gas.S
+            ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_elf_gas.S
+            ${LIBRARY_DIR}/libs/context/src/dummy.cpp
+            ${LIBRARY_DIR}/libs/context/src/execution_context.cpp
+            ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
+        )
+    endif()
+
+    add_library (_boost_context ${SRCS_CONTEXT})
+    add_library (boost::context ALIAS _boost_context)
+    target_include_directories (_boost_context PRIVATE ${LIBRARY_DIR})
 endif ()
--- a/contrib/grpc-cmake/CMakeLists.txt
+++ b/contrib/grpc-cmake/CMakeLists.txt
@ -54,6 +54,26 @@ else ()
  set(CARES_SHARED ON CACHE BOOL "" FORCE)
 endif ()

+# Disable looking for libnsl on platforms that have gethostbyname in glibc
+#
+# c-ares searches for gethostbyname in the libnsl library; however, the
+# version shipped with gRPC does it wrong [1], since it uses
+# CHECK_LIBRARY_EXISTS(), which will return TRUE even if the function exists in
+# another dependent library. The upstream already contains the correct macro [2],
+# but it is not included in gRPC (not even in upstream gRPC, not just the copy
+# that is shipped with ClickHouse).
+#
+#   [1]: https://github.com/c-ares/c-ares/blob/e982924acee7f7313b4baa4ee5ec000c5e373c30/CMakeLists.txt#L125
+#   [2]: https://github.com/c-ares/c-ares/blob/44fbc813685a1fa8aa3f27fcd7544faf612d376a/CMakeLists.txt#L146
+#
+# As a result, if for some reason you have libnsl [3] installed, clickhouse will
+# refuse to start without it, even though it is a completely different library.
+#
+#   [3]: https://packages.debian.org/bullseye/libnsl2
+if (NOT CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+  set(HAVE_LIBNSL OFF CACHE BOOL "" FORCE)
+endif()
+
 # We don't want to build C# extensions.
 set(gRPC_BUILD_CSHARP_EXT OFF)

--- a/contrib/jemalloc
+++ b/contrib/jemalloc
@ -1 +1 @@
-Subproject commit 93e27e435cac846028da20cd9b0841fbc9110bd2
+Subproject commit e6891d9746143bf2cf617493d880ba5a0b9a3efd
--- a/contrib/poco
+++ b/contrib/poco
@ -1 +1 @@
-Subproject commit 08974cc024b2e748f5b1d45415396706b3521d0f
+Subproject commit 2c32e17c7dfee1f8bf24227b697cdef5fddf0823
--- a/contrib/rocksdb-cmake/CMakeLists.txt
+++ b/contrib/rocksdb-cmake/CMakeLists.txt
@ -2,12 +2,6 @@
 set(ROCKSDB_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/rocksdb")
 list(APPEND CMAKE_MODULE_PATH "${ROCKSDB_SOURCE_DIR}/cmake/modules/")

-find_program(CCACHE_FOUND ccache)
-if(CCACHE_FOUND)
-  set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
-  set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache)
-endif(CCACHE_FOUND)
-
 if (SANITIZE STREQUAL "undefined")
    set(WITH_UBSAN ON)
 elseif (SANITIZE STREQUAL "address")
--- a/debian/clickhouse-test.install
+++ b/debian/clickhouse-test.install
@ -1,5 +1,2 @@
 usr/bin/clickhouse-test
-usr/bin/clickhouse-test-server
 usr/share/clickhouse-test/*
-etc/clickhouse-client/client-test.xml
-etc/clickhouse-server/server-test.xml
--- a/debian/rules
+++ b/debian/rules
@ -62,7 +62,7 @@ ifndef DISABLE_NINJA
    NINJA=$(shell which ninja)
 ifneq ($(NINJA),)
        CMAKE_FLAGS += -GNinja
-        export MAKE=$(NINJA)
+        export MAKE=$(NINJA) $(NINJA_FLAGS)
 endif
 endif

@ -93,7 +93,7 @@ override_dh_auto_build:

 override_dh_auto_test:
 ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
-	cd $(BUILDDIR) && ctest $(THREADS_COUNT) -V -E with_server
+	cd $(BUILDDIR) && ctest $(THREADS_COUNT) -V
 endif

 override_dh_clean:
--- a/docker/packager/unbundled/Dockerfile
+++ b/docker/packager/unbundled/Dockerfile
@ -21,6 +21,7 @@ RUN apt-get update \
        libboost-thread-dev \
        libboost-iostreams-dev \
        libboost-regex-dev \
+        libboost-context-dev \
        zlib1g-dev \
        liblz4-dev \
        libdouble-conversion-dev \
--- a/docker/server/alpine-build.sh
+++ b/docker/server/alpine-build.sh
@ -47,13 +47,13 @@ cp "${DOCKER_BUILD_FOLDER}/entrypoint.alpine.sh"      "${CONTAINER_ROOT_FOLDER}/
 ## get glibc components from ubuntu 20.04 and put them to expected place
 docker pull ubuntu:20.04
 ubuntu20image=$(docker create --rm ubuntu:20.04)
-docker cp -L ${ubuntu20image}:/lib/x86_64-linux-gnu/libc.so.6       "${CONTAINER_ROOT_FOLDER}/lib"
-docker cp -L ${ubuntu20image}:/lib/x86_64-linux-gnu/libdl.so.2      "${CONTAINER_ROOT_FOLDER}/lib"
-docker cp -L ${ubuntu20image}:/lib/x86_64-linux-gnu/libm.so.6       "${CONTAINER_ROOT_FOLDER}/lib"
-docker cp -L ${ubuntu20image}:/lib/x86_64-linux-gnu/libpthread.so.0 "${CONTAINER_ROOT_FOLDER}/lib"
-docker cp -L ${ubuntu20image}:/lib/x86_64-linux-gnu/librt.so.1      "${CONTAINER_ROOT_FOLDER}/lib"
-docker cp -L ${ubuntu20image}:/lib/x86_64-linux-gnu/libnss_dns.so.2 "${CONTAINER_ROOT_FOLDER}/lib"
-docker cp -L ${ubuntu20image}:/lib/x86_64-linux-gnu/libresolv.so.2  "${CONTAINER_ROOT_FOLDER}/lib"
-docker cp -L ${ubuntu20image}:/lib64/ld-linux-x86-64.so.2           "${CONTAINER_ROOT_FOLDER}/lib64"
+docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libc.so.6       "${CONTAINER_ROOT_FOLDER}/lib"
+docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libdl.so.2      "${CONTAINER_ROOT_FOLDER}/lib"
+docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libm.so.6       "${CONTAINER_ROOT_FOLDER}/lib"
+docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libpthread.so.0 "${CONTAINER_ROOT_FOLDER}/lib"
+docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/librt.so.1      "${CONTAINER_ROOT_FOLDER}/lib"
+docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libnss_dns.so.2 "${CONTAINER_ROOT_FOLDER}/lib"
+docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libresolv.so.2  "${CONTAINER_ROOT_FOLDER}/lib"
+docker cp -L "${ubuntu20image}":/lib64/ld-linux-x86-64.so.2           "${CONTAINER_ROOT_FOLDER}/lib64"

 docker build "$DOCKER_BUILD_FOLDER" -f Dockerfile.alpine -t "yandex/clickhouse-server:${VERSION}-alpine" --pull
--- a/docker/server/entrypoint.alpine.sh
+++ b/docker/server/entrypoint.alpine.sh
@ -26,17 +26,17 @@ fi
 CLICKHOUSE_CONFIG="${CLICKHOUSE_CONFIG:-/etc/clickhouse-server/config.xml}"

 # port is needed to check if clickhouse-server is ready for connections
-HTTP_PORT="$(clickhouse extract-from-config --config-file $CLICKHOUSE_CONFIG --key=http_port)"
+HTTP_PORT="$(clickhouse extract-from-config --config-file "${CLICKHOUSE_CONFIG}" --key=http_port)"

 # get CH directories locations
-DATA_DIR="$(clickhouse extract-from-config --config-file $CLICKHOUSE_CONFIG --key=path || true)"
-TMP_DIR="$(clickhouse extract-from-config --config-file $CLICKHOUSE_CONFIG --key=tmp_path || true)"
-USER_PATH="$(clickhouse extract-from-config --config-file $CLICKHOUSE_CONFIG --key=user_files_path || true)"
-LOG_PATH="$(clickhouse extract-from-config --config-file $CLICKHOUSE_CONFIG --key=logger.log || true)"
-LOG_DIR="$(dirname $LOG_PATH || true)"
-ERROR_LOG_PATH="$(clickhouse extract-from-config --config-file $CLICKHOUSE_CONFIG --key=logger.errorlog || true)"
-ERROR_LOG_DIR="$(dirname $ERROR_LOG_PATH || true)"
-FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file $CLICKHOUSE_CONFIG --key=format_schema_path || true)"
+DATA_DIR="$(clickhouse extract-from-config --config-file "${CLICKHOUSE_CONFIG}" --key=path || true)"
+TMP_DIR="$(clickhouse extract-from-config --config-file "${CLICKHOUSE_CONFIG}" --key=tmp_path || true)"
+USER_PATH="$(clickhouse extract-from-config --config-file "${CLICKHOUSE_CONFIG}" --key=user_files_path || true)"
+LOG_PATH="$(clickhouse extract-from-config --config-file "${CLICKHOUSE_CONFIG}" --key=logger.log || true)"
+LOG_DIR="$(dirname "${LOG_PATH}" || true)"
+ERROR_LOG_PATH="$(clickhouse extract-from-config --config-file "${CLICKHOUSE_CONFIG}" --key=logger.errorlog || true)"
+ERROR_LOG_DIR="$(dirname "${ERROR_LOG_PATH}" || true)"
+FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "${CLICKHOUSE_CONFIG}" --key=format_schema_path || true)"

 CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}"
 CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-}"
@ -92,7 +92,7 @@ fi

 if [ -n "$(ls /docker-entrypoint-initdb.d/)" ] || [ -n "$CLICKHOUSE_DB" ]; then
    # Listen only on localhost until the initialization is done
-    $gosu /usr/bin/clickhouse-server --config-file=$CLICKHOUSE_CONFIG -- --listen_host=127.0.0.1 &
+    $gosu /usr/bin/clickhouse-server --config-file="${CLICKHOUSE_CONFIG}" -- --listen_host=127.0.0.1 &
    pid="$!"

    # check if clickhouse is ready to accept connections
@ -107,7 +107,7 @@ if [ -n "$(ls /docker-entrypoint-initdb.d/)" ] || [ -n "$CLICKHOUSE_DB" ]; then
        sleep 1
    done

-    if [ ! -z "$CLICKHOUSE_PASSWORD" ]; then
+    if [ -n "$CLICKHOUSE_PASSWORD" ]; then
        printf -v WITH_PASSWORD '%s %q' "--password" "$CLICKHOUSE_PASSWORD"
    fi

@ -130,7 +130,7 @@ if [ -n "$(ls /docker-entrypoint-initdb.d/)" ] || [ -n "$CLICKHOUSE_DB" ]; then
                    . "$f"
                fi
                ;;
-            *.sql)    echo "$0: running $f"; cat "$f" | "$clickhouseclient" ; echo ;;
+            *.sql)    echo "$0: running $f"; "$clickhouseclient" < "$f" ; echo ;;
            *.sql.gz) echo "$0: running $f"; gunzip -c "$f" | "$clickhouseclient"; echo ;;
            *)        echo "$0: ignoring $f" ;;
        esac
@ -145,7 +145,7 @@ fi

 # if no args passed to `docker run` or first argument start with `--`, then the user is passing clickhouse-server arguments
 if [[ $# -lt 1 ]] || [[ "$1" == "--"* ]]; then
-    exec $gosu /usr/bin/clickhouse-server --config-file=$CLICKHOUSE_CONFIG "$@"
+    exec $gosu /usr/bin/clickhouse-server --config-file="${CLICKHOUSE_CONFIG}" "$@"
 fi

 # Otherwise, we assume the user want to run his own process, for example a `bash` shell to explore this image
--- a/docker/test/stateless/run.sh
+++ b/docker/test/stateless/run.sh
@ -12,7 +12,32 @@ dpkg -i package_folder/clickhouse-test_*.deb
 # install test configs
 /usr/share/clickhouse-test/config/install.sh

-service clickhouse-server start && sleep 5
+# For flaky check we also enable thread fuzzer
+if [ "$NUM_TRIES" -gt "1" ]; then
+    export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
+    export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
+    export THREAD_FUZZER_SLEEP_TIME_US=100000
+
+    export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
+    export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
+    export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1
+    export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1
+
+    export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001
+    export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
+    export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
+    export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
+    export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000
+    export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
+    export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
+    export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
+
+    # simplest way to forward env variables to server
+    sudo -E -u clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml --daemon
+    sleep 5
+else
+    service clickhouse-server start && sleep 5
+fi

 if grep -q -- "--use-skip-list" /usr/bin/clickhouse-test; then
    SKIP_LIST_OPT="--use-skip-list"
--- a/docs/_description_templates/template-server-setting.md
+++ b/docs/_description_templates/template-server-setting.md
@ -0,0 +1,33 @@
+## server_setting_name {#server_setting_name}
+
+Description.
+
+Describe what is configured in this section of settings.
+
+Possible value: ...
+
+Default value: ...
+
+Settings: (Optional)
+
+If the section contains several settings, list them here. Specify possible values and default values:
+
+-   setting_1 — Description.
+-   setting_2 — Description.
+
+**Example:**
+
+```xml
+<server_setting_name>
+    <setting_1> ... </setting_1>
+    <setting_2> ... </setting_2>
+</server_setting_name>
+```
+
+**Additional Info** (Optional)
+
+An additional section can have any name, for example, **Usage**.
+
+**See Also** (Optional)
+
+-   [link](#)
--- a/docs/en/getting-started/example-datasets/index.md
+++ b/docs/en/getting-started/example-datasets/index.md
@ -12,6 +12,7 @@ The list of documented datasets:

 -   [GitHub Events](../../getting-started/example-datasets/github-events.md)
 -   [Anonymized Yandex.Metrica Dataset](../../getting-started/example-datasets/metrica.md)
+-   [Recipes](../../getting-started/example-datasets/recipes.md)
 -   [Star Schema Benchmark](../../getting-started/example-datasets/star-schema.md)
 -   [WikiStat](../../getting-started/example-datasets/wikistat.md)
 -   [Terabyte of Click Logs from Criteo](../../getting-started/example-datasets/criteo.md)
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -1855,6 +1855,18 @@ Default value: `0`.
 -   [Distributed Table Engine](../../engines/table-engines/special/distributed.md#distributed)
 -   [Managing Distributed Tables](../../sql-reference/statements/system.md#query-language-system-distributed)

+## insert_distributed_one_random_shard {#insert_distributed_one_random_shard}
+
+Enables or disables random shard insertion into a [Distributed](../../engines/table-engines/special/distributed.md#distributed) table when there is no distributed key.
+
+By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will reject any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards.
+
+Possible values:
+
+-   0 — Insertion is rejected if there are multiple shards and no distributed key is given.
+-   1 — Insertion is done randomly among all available shards when no distributed key is given.
+
+Default value: `0`.

 ## use_compact_format_in_distributed_parts_names {#use_compact_format_in_distributed_parts_names}

@ -2447,7 +2459,6 @@ Result:
 {"number":"2"}
 ```

-=======
 ## allow_nullable_key {#allow-nullable-key}

 Allows using of the [Nullable](../../sql-reference/data-types/nullable.md#data_type-nullable)-typed values in a sorting and a primary key for [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree) tables.
--- a/docs/en/sql-reference/statements/alter/column.md
+++ b/docs/en/sql-reference/statements/alter/column.md
@ -29,12 +29,12 @@ These actions are described in detail below.
 ## ADD COLUMN {#alter_add-column}

 ``` sql
-ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [codec] [AFTER name_after]
+ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [codec] [AFTER name_after | FIRST]
 ```

 Adds a new column to the table with the specified `name`, `type`, [`codec`](../../../sql-reference/statements/create/table.md#codecs) and `default_expr` (see the section [Default expressions](../../../sql-reference/statements/create/table.md#create-default-values)).

-If the `IF NOT EXISTS` clause is included, the query won’t return an error if the column already exists. If you specify `AFTER name_after` (the name of another column), the column is added after the specified one in the list of table columns. Otherwise, the column is added to the end of the table. Note that there is no way to add a column to the beginning of a table. For a chain of actions, `name_after` can be the name of a column that is added in one of the previous actions.
+If the `IF NOT EXISTS` clause is included, the query won’t return an error if the column already exists. If you specify `AFTER name_after` (the name of another column), the column is added after the specified one in the list of table columns. To add a column to the beginning of the table, use the `FIRST` clause. Otherwise, the column is added to the end of the table. For a chain of actions, `name_after` can be the name of a column that is added in one of the previous actions.

 Adding a column just changes the table structure, without performing any actions with data. The data doesn’t appear on the disk after `ALTER`. If the data is missing for a column when reading from the table, it is filled in with default values (by performing the default expression if there is one, or using zeros or empty strings). The column appears on the disk after merging data parts (see [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md)).

@ -43,9 +43,24 @@ This approach allows us to complete the `ALTER` query instantly, without increas
 Example:

 ``` sql
-ALTER TABLE visits ADD COLUMN browser String AFTER user_id
+ALTER TABLE alter_test ADD COLUMN Added1 UInt32 FIRST;
+ALTER TABLE alter_test ADD COLUMN Added2 UInt32 AFTER NestedColumn;
+ALTER TABLE alter_test ADD COLUMN Added3 UInt32 AFTER ToDrop;
+DESC alter_test FORMAT TSV;
 ```

+``` text
+Added1  UInt32
+CounterID       UInt32
+StartDate       Date
+UserID  UInt32
+VisitID UInt32
+NestedColumn.A  Array(UInt8)
+NestedColumn.S  Array(String)
+Added2  UInt32
+ToDrop  UInt32
+Added3  UInt32
+```
 ## DROP COLUMN {#alter_drop-column}

 ``` sql
@ -99,7 +114,7 @@ ALTER TABLE visits COMMENT COLUMN browser 'The table shows the browser used for
 ## MODIFY COLUMN {#alter_modify-column}

 ``` sql
-MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL]
+MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL] [AFTER name_after | FIRST]
 ```

 This query changes the `name` column properties:
@ -114,6 +129,8 @@ This query changes the `name` column properties:

 If the `IF EXISTS` clause is specified, the query won’t return an error if the column doesn’t exist.

+The query can also change the order of the columns using the `FIRST | AFTER` clause; see the [ADD COLUMN](#alter_add-column) description.
+
 When changing the type, values are converted as if the [toType](../../../sql-reference/functions/type-conversion-functions.md) functions were applied to them. If only the default expression is changed, the query doesn’t do anything complex, and is completed almost instantly.

 Example:
@ -124,15 +141,7 @@ ALTER TABLE visits MODIFY COLUMN browser Array(String)

 Changing the column type is the only complex action – it changes the contents of files with data. For large tables, this may take a long time.

-There are several processing stages:
-
-   Preparing temporary (new) files with modified data.
-   Renaming old files.
-   Renaming the temporary (new) files to the old names.
-   Deleting the old files.
-
-Only the first stage takes time. If there is a failure at this stage, the data is not changed.
-If there is a failure during one of the successive stages, data can be restored manually. The exception is if the old files were deleted from the file system but the data for the new files did not get written to the disk and was lost.
+The `ALTER` query is atomic. For MergeTree tables it is also lock-free.

 The `ALTER` query for changing columns is replicated. The instructions are saved in ZooKeeper, then each replica applies them. All `ALTER` queries are run in the same order. The query waits for the appropriate actions to be completed on the other replicas. However, a query to change columns in a replicated table can be interrupted, and all actions will be performed asynchronously.

--- a/docs/ru/interfaces/formats.md
+++ b/docs/ru/interfaces/formats.md
@ -9,7 +9,6 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT

 Поддерживаемые форматы и возможность использовать их в запросах `INSERT` и `SELECT` перечислены в таблице ниже.

-=======
 | Формат                                                                                  | INSERT | SELECT |
 |-----------------------------------------------------------------------------------------|--------|--------|
 | [TabSeparated](#tabseparated)                                                           | ✔     | ✔      |
--- a/docs/ru/sql-reference/statements/alter/column.md
+++ b/docs/ru/sql-reference/statements/alter/column.md
@ -18,12 +18,12 @@ toc_title: "\u041c\u0430\u043d\u0438\u043f\u0443\u043b\u044f\u0446\u0438\u0438\u
 ## ADD COLUMN {#alter_add-column}

 ``` sql
-ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [codec] [AFTER name_after]
+ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [codec] [AFTER name_after | FIRST]
 ```

 Добавляет в таблицу новый столбец с именем `name`, типом `type`, [кодеком](../create/table.md#codecs) `codec` и выражением для умолчания `default_expr` (смотрите раздел [Значения по умолчанию](../create/index.md#create-default-values)).

-Если указано `IF NOT EXISTS`, запрос не будет возвращать ошибку, если столбец уже существует. Если указано `AFTER name_after` (имя другого столбца), то столбец добавляется (в список столбцов таблицы) после указанного. Иначе, столбец добавляется в конец таблицы. Обратите внимание, ClickHouse не позволяет добавлять столбцы в начало таблицы. Для цепочки действий, `name_after` может быть именем столбца, который добавляется в одном из предыдущих действий.
+Если указано `IF NOT EXISTS`, запрос не будет возвращать ошибку, если столбец уже существует. Если указано `AFTER name_after` (имя другого столбца), то столбец добавляется (в список столбцов таблицы) после указанного. Если вы хотите добавить столбец в начало таблицы, используйте `FIRST`. Иначе столбец добавляется в конец таблицы. Для цепочки действий `name_after` может быть именем столбца, который добавляется в одном из предыдущих действий.

 Добавление столбца всего лишь меняет структуру таблицы, и не производит никаких действий с данными - соответствующие данные не появляются на диске после ALTER-а. При чтении из таблицы, если для какого-либо столбца отсутствуют данные, то он заполняется значениями по умолчанию (выполняя выражение по умолчанию, если такое есть, или нулями, пустыми строками). Также, столбец появляется на диске при слиянии кусков данных (см. [MergeTree](../../../sql-reference/statements/alter/index.md)).

@ -32,7 +32,23 @@ ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [codec] [AFTER name_after]
 Пример:

 ``` sql
-ALTER TABLE visits ADD COLUMN browser String AFTER user_id
+ALTER TABLE alter_test ADD COLUMN Added1 UInt32 FIRST;
+ALTER TABLE alter_test ADD COLUMN Added2 UInt32 AFTER NestedColumn;
+ALTER TABLE alter_test ADD COLUMN Added3 UInt32 AFTER ToDrop;
+DESC alter_test FORMAT TSV;
+```
+
+``` text
+Added1  UInt32
+CounterID       UInt32
+StartDate       Date
+UserID  UInt32
+VisitID UInt32
+NestedColumn.A  Array(UInt8)
+NestedColumn.S  Array(String)
+Added2  UInt32
+ToDrop  UInt32
+Added3  UInt32
 ```

 ## DROP COLUMN {#alter_drop-column}
@ -88,7 +104,7 @@ ALTER TABLE visits COMMENT COLUMN browser 'Столбец показывает,
 ## MODIFY COLUMN {#alter_modify-column}

 ``` sql
-MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL]
+MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL] [AFTER name_after | FIRST]
 ```

 Запрос изменяет следующие свойства столбца `name`:
@ -103,6 +119,8 @@ MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL]

 Если указано `IF EXISTS`, запрос не возвращает ошибку, если столбца не существует.

+Запрос также может изменять порядок столбцов при помощи `FIRST | AFTER`, смотрите описание [ADD COLUMN](#alter_add-column).
+
 При изменении типа, значения преобразуются так, как если бы к ним была применена функция [toType](../../../sql-reference/statements/alter/index.md). Если изменяется только выражение для умолчания, запрос не делает никакой сложной работы и выполняется мгновенно.

 Пример запроса:
@ -113,15 +131,7 @@ ALTER TABLE visits MODIFY COLUMN browser Array(String)

 Изменение типа столбца - это единственное действие, которое выполняет сложную работу - меняет содержимое файлов с данными. Для больших таблиц, выполнение может занять длительное время.

-Выполнение производится в несколько стадий:
-
-   подготовка временных (новых) файлов с изменёнными данными;
-   переименование старых файлов;
-   переименование временных (новых) файлов в старые;
-   удаление старых файлов.
-
-Из них, длительной является только первая стадия. Если на этой стадии возникнет сбой, то данные не поменяются.
-Если на одной из следующих стадий возникнет сбой, то данные будет можно восстановить вручную. За исключением случаев, когда старые файлы удалены из файловой системы, а данные для новых файлов не доехали на диск и потеряны.
+Выполнение запроса ALTER атомарно.

 Запрос `ALTER` на изменение столбцов реплицируется. Соответствующие инструкции сохраняются в ZooKeeper, и затем каждая реплика их применяет. Все запросы `ALTER` выполняются в одном и том же порядке. Запрос ждёт выполнения соответствующих действий на всех репликах. Но при этом, запрос на изменение столбцов в реплицируемой таблице можно прервать, и все действия будут осуществлены асинхронно.

--- a/docs/tools/README.md
+++ b/docs/tools/README.md
@ -28,8 +28,8 @@ Follow the instructions on it's official website: <https://wkhtmltopdf.org/downl

 #### 2. Install CLI tools from npm

-1. `apt-get install npm` for Debian/Ubuntu or `brew install npm` on Mac OS X.
-2. `npm install -g purifycss amphtml-validator`.
+1. `sudo apt-get install npm` for Debian/Ubuntu or `brew install npm` on Mac OS X.
+2. `sudo npm install -g purify-css amphtml-validator`.

 #### 3. Set up virtualenv

--- a/docs/tools/build.py
+++ b/docs/tools/build.py
@ -48,11 +48,6 @@ def build_for_lang(lang, args):
    logging.info(f'Building {lang} docs')
    os.environ['SINGLE_PAGE'] = '0'

-    config_path = os.path.join(args.docs_dir, f'toc_{lang}.yml')
-    if args.is_stable_release and not os.path.exists(config_path):
-        logging.warning(f'Skipping {lang} docs, because {config} does not exist')
-        return
-
    try:
        theme_cfg = {
            'name': None,
@ -73,9 +68,7 @@ def build_for_lang(lang, args):
            'es': 'Español',
            'fr': 'Français',
            'ru': 'Русский',
-            'ja': '日本語',
-            'tr': 'Türkçe',
-            'fa': 'فارسی'
+            'ja': '日本語'
        }

        site_names = {
@ -84,31 +77,26 @@ def build_for_lang(lang, args):
            'es': 'Documentación de ClickHouse %s',
            'fr': 'Documentation ClickHouse %s',
            'ru': 'Документация ClickHouse %s',
-            'ja': 'ClickHouseドキュメント %s',
-            'tr': 'ClickHouse Belgeleri %s',
-            'fa': 'مستندات %sClickHouse'
+            'ja': 'ClickHouseドキュメント %s'
        }

        assert len(site_names) == len(languages)

-        if args.version_prefix:
-            site_dir = os.path.join(args.docs_output_dir, args.version_prefix, lang)
-        else:
-            site_dir = os.path.join(args.docs_output_dir, lang)
+        site_dir = os.path.join(args.docs_output_dir, lang)

        plugins = ['macros']
        if args.htmlproofer:
            plugins.append('htmlproofer')

        website_url = 'https://clickhouse.tech'
-        site_name = site_names.get(lang, site_names['en']) % args.version_prefix
+        site_name = site_names.get(lang, site_names['en']) % ''
        site_name = site_name.replace('  ', ' ')
        raw_config = dict(
            site_name=site_name,
            site_url=f'{website_url}/docs/{lang}/',
            docs_dir=os.path.join(args.docs_dir, lang),
            site_dir=site_dir,
-            strict=not args.version_prefix,
+            strict=True,
            theme=theme_cfg,
            copyright='©2016–2020 Yandex LLC',
            use_directory_urls=True,
@ -119,8 +107,6 @@ def build_for_lang(lang, args):
            plugins=plugins,
            extra=dict(
                now=datetime.datetime.now().isoformat(),
-                stable_releases=args.stable_releases,
-                version_prefix=args.version_prefix,
                single_page=False,
                rev=args.rev,
                rev_short=args.rev_short,
@ -134,23 +120,14 @@ def build_for_lang(lang, args):
            )
        )

-        if os.path.exists(config_path):
-            raw_config['config_file'] = config_path
-        else:
-            raw_config['nav'] = nav.build_docs_nav(lang, args)
+        raw_config['nav'] = nav.build_docs_nav(lang, args)

        cfg = config.load_config(**raw_config)

        if not args.skip_multi_page:
-            try:
-                mkdocs.commands.build.build(cfg)
-            except jinja2.exceptions.TemplateError:
-                if not args.version_prefix:
-                    raise
-                mdx_clickhouse.PatchedMacrosPlugin.disabled = True
-                mkdocs.commands.build.build(cfg)
+            mkdocs.commands.build.build(cfg)

-        if not (args.skip_amp or args.version_prefix):
+        if not args.skip_amp:
            amp.build_amp(lang, args, cfg)

        if not args.skip_single_page:
@ -170,8 +147,7 @@ def build_docs(args):
        if lang:
            tasks.append((lang, args,))
    util.run_function_in_parallel(build_for_lang, tasks, threads=False)
-    if not args.version_prefix:
-        redirects.build_docs_redirects(args)
+    redirects.build_docs_redirects(args)


 def build(args):
@ -188,8 +164,6 @@ def build(args):
        generate_cmake_flags_files()

        build_docs(args)
-        from github import build_releases
-        build_releases(args, build_docs)

    if not args.skip_blog:
        blog.build_blog(args)
@ -209,7 +183,7 @@ if __name__ == '__main__':
    website_dir = os.path.join(src_dir, 'website')

    arg_parser = argparse.ArgumentParser()
-    arg_parser.add_argument('--lang', default='en,es,fr,ru,zh,ja,tr,fa')
+    arg_parser.add_argument('--lang', default='en,es,fr,ru,zh,ja')
    arg_parser.add_argument('--blog-lang', default='en,ru')
    arg_parser.add_argument('--docs-dir', default='.')
    arg_parser.add_argument('--theme-dir', default=website_dir)
@ -217,12 +191,7 @@ if __name__ == '__main__':
    arg_parser.add_argument('--src-dir', default=src_dir)
    arg_parser.add_argument('--blog-dir', default=os.path.join(website_dir, 'blog'))
    arg_parser.add_argument('--output-dir', default='build')
-    arg_parser.add_argument('--enable-stable-releases', action='store_true')
-    arg_parser.add_argument('--stable-releases-limit', type=int, default='3')
-    arg_parser.add_argument('--lts-releases-limit', type=int, default='2')
    arg_parser.add_argument('--nav-limit', type=int, default='0')
-    arg_parser.add_argument('--version-prefix', type=str, default='')
-    arg_parser.add_argument('--is-stable-release', action='store_true')
    arg_parser.add_argument('--skip-multi-page', action='store_true')
    arg_parser.add_argument('--skip-single-page', action='store_true')
    arg_parser.add_argument('--skip-amp', action='store_true')
@ -252,8 +221,7 @@ if __name__ == '__main__':
    args.docs_output_dir = os.path.join(os.path.abspath(args.output_dir), 'docs')
    args.blog_output_dir = os.path.join(os.path.abspath(args.output_dir), 'blog')

-    from github import choose_latest_releases, get_events
-    args.stable_releases = choose_latest_releases(args) if args.enable_stable_releases else []
+    from github import get_events
    args.rev = subprocess.check_output('git rev-parse HEAD', shell=True).decode('utf-8').strip()
    args.rev_short = subprocess.check_output('git rev-parse --short HEAD', shell=True).decode('utf-8').strip()
    args.rev_url = f'https://github.com/ClickHouse/ClickHouse/commit/{args.rev}'
--- a/docs/tools/github.py
+++ b/docs/tools/github.py
@ -13,88 +13,6 @@ import requests
 import util


-def yield_candidates():
-    for page in range(1, 100):
-        url = f'https://api.github.com/repos/ClickHouse/ClickHouse/tags?per_page=100&page={page}'
-        github_token = os.getenv('GITHUB_TOKEN')
-        if github_token:
-            headers = {'authorization': f'OAuth {github_token}'}
-        else:
-            headers = {}
-        for candidate in requests.get(url, headers=headers).json():
-            yield candidate
-    time.sleep(random.random() * 3)
-
-
-def choose_latest_releases(args):
-    logging.info('Collecting release candidates')
-    seen_stable = collections.OrderedDict()
-    seen_lts = collections.OrderedDict()
-    candidates = []
-    stable_count = 0
-    lts_count = 0
-
-    for tag in yield_candidates():
-        if isinstance(tag, dict):
-            name = tag.get('name', '')
-            is_stable = 'stable' in name
-            is_lts = 'lts' in name
-            is_unstable = not (is_stable or is_lts)
-            is_in_blacklist = ('v18' in name) or ('prestable' in name) or ('v1.1' in name)
-            if is_unstable or is_in_blacklist:
-                continue
-            major_version = '.'.join((name.split('.', 2))[:2])
-            if major_version not in seen_lts:
-                if (stable_count >= args.stable_releases_limit) and (lts_count >= args.lts_releases_limit):
-                    break
-
-                payload = (name, tag.get('tarball_url'), is_lts,)
-                logging.debug(payload)
-                if is_lts:
-                    if lts_count < args.lts_releases_limit:
-                        seen_lts[major_version] = payload
-                        try:
-                            del seen_stable[major_version]
-                        except KeyError:
-                            pass
-                    lts_count += 1
-                else:
-                    if stable_count < args.stable_releases_limit:
-                        if major_version not in seen_stable:
-                            seen_stable[major_version] = payload
-                            stable_count += 1
-
-            logging.debug(
-                f'Stables: {stable_count}/{args.stable_releases_limit} LTS: {lts_count}/{args.lts_releases_limit}'
-            )
-        else:
-            logging.fatal('Unexpected GitHub response: %s', str(candidates))
-            sys.exit(1)
-
-    logging.info('Found LTS releases: %s', ', '.join(list(seen_lts.keys())))
-    logging.info('Found stable releases: %s', ', '.join(list(seen_stable.keys())))
-    return sorted(list(seen_lts.items()) + list(seen_stable.items()))
-
-
-def process_release(args, callback, release):
-    name, (full_name, tarball_url, is_lts,) = release
-    logging.info(f'Building docs for {full_name}')
-    buf = io.BytesIO(requests.get(tarball_url).content)
-    tar = tarfile.open(mode='r:gz', fileobj=buf)
-    with util.temp_dir() as base_dir:
-        tar.extractall(base_dir)
-        args = copy.copy(args)
-        args.version_prefix = name
-        args.is_stable_release = True
-        args.docs_dir = os.path.join(base_dir, os.listdir(base_dir)[0], 'docs')
-        callback(args)
-
-
-def build_releases(args, callback):
-    for release in args.stable_releases:
-        process_release(args, callback, release)
-
-
 def get_events(args):
    events = []
    skip = True
@ -118,12 +36,7 @@ def get_events(args):


 if __name__ == '__main__':
-    class DummyArgs(object):
-        lts_releases_limit = 1
-        stable_releases_limit = 3
    logging.basicConfig(
        level=logging.DEBUG,
        stream=sys.stderr
    )
-    for item in choose_latest_releases(DummyArgs()):
-        print(item)
--- a/docs/tools/mdx_clickhouse.py
+++ b/docs/tools/mdx_clickhouse.py
@ -145,24 +145,9 @@ class PatchedMacrosPlugin(macros.plugin.MacrosPlugin):
        if self.skip_git_log:
            return markdown
        src_path = page.file.abs_src_path
-        try:
-            git_log = subprocess.check_output(f'git log --follow --date=iso8601 "{src_path}"', shell=True)
-        except subprocess.CalledProcessError:
-            return markdown
-        max_date = None
-        min_date = None
-        for line in git_log.decode('utf-8').split('\n'):
-            if line.startswith('Date:'):
-                line = line.replace('Date:', '').strip().replace(' ', 'T', 1).replace(' ', '')
-                current_date = datetime.datetime.fromisoformat(line[:-2] + ':' + line[-2:])
-                if (not max_date) or current_date > max_date:
-                    max_date = current_date
-                if (not min_date) or current_date < min_date:
-                    min_date = current_date
-        if min_date:
-            page.meta['published_date'] = min_date
-        if max_date:
-            page.meta['modified_date'] = max_date
+
+        # There used to be code here that determined the minimum and maximum modification dates for a page.
+        # It was removed because it was obnoxiously slow.
        return markdown

    def render_impl(self, markdown):
--- a/docs/tools/purge_cache_for_changed_files.py
+++ b/docs/tools/purge_cache_for_changed_files.py
@ -1,78 +0,0 @@
-#!/usr/bin/env python3
-
-import subprocess
-import requests
-import os
-import time
-
-FNAME_START = "+++"
-
-CLOUDFLARE_URL = "https://api.cloudflare.com/client/v4/zones/4fc6fb1d46e87851605aa7fa69ca6fe0/purge_cache"
-
-# we have changes in revision and commit sha on all pages
-# so such changes have to be ignored
-MIN_CHANGED_WORDS = 4
-
-
-def collect_changed_files():
-    proc = subprocess.Popen("git diff HEAD~1 --word-diff=porcelain | grep -e '^+[^+]\|^\-[^\-]\|^\+\+\+'", stdout=subprocess.PIPE, shell=True)
-    changed_files = []
-    current_file_name = ""
-    changed_words = []
-    while True:
-        line = proc.stdout.readline().decode("utf-8").strip()
-        if not line:
-            break
-        if FNAME_START in line:
-            if changed_words:
-                if len(changed_words) > MIN_CHANGED_WORDS:
-                    changed_files.append(current_file_name)
-                changed_words = []
-            current_file_name = line[6:]
-        else:
-            changed_words.append(line)
-    return changed_files
-
-
-def filter_and_transform_changed_files(changed_files, base_domain):
-    result = []
-    for f in changed_files:
-        if f.endswith(".html"):
-            result.append(base_domain + f.replace("index.html", ""))
-    return result
-
-
-def convert_to_dicts(changed_files, batch_size):
-    result = []
-    current_batch = {"files": []}
-    for f in changed_files:
-        if len(current_batch["files"]) >= batch_size:
-            result.append(current_batch)
-            current_batch = {"files": []}
-        current_batch["files"].append(f)
-
-    if current_batch["files"]:
-        result.append(current_batch)
-    return result
-
-
-def post_data(prepared_batches, token):
-    headers = {"Authorization": "Bearer {}".format(token)}
-    for batch in prepared_batches:
-        print(("Pugring cache for", ", ".join(batch["files"])))
-        response = requests.post(CLOUDFLARE_URL, json=batch, headers=headers)
-        response.raise_for_status()
-        time.sleep(3)
-
-
-if __name__ == "__main__":
-    token = os.getenv("CLOUDFLARE_TOKEN")
-    if not token:
-        raise Exception("Env variable CLOUDFLARE_TOKEN is empty")
-    base_domain = os.getenv("BASE_DOMAIN", "https://content.clickhouse.tech/")
-    changed_files = collect_changed_files()
-    print(("Found", len(changed_files), "changed files"))
-    filtered_files = filter_and_transform_changed_files(changed_files, base_domain)
-    print(("Files rest after filtering", len(filtered_files)))
-    prepared_batches = convert_to_dicts(filtered_files, 25)
-    post_data(prepared_batches, token)
--- a/docs/tools/redirects.py
+++ b/docs/tools/redirects.py
@ -30,9 +30,8 @@ def build_redirect_html(args, base_prefix, lang, output_dir, from_path, to_path)
        output_dir, lang,
        from_path.replace('/index.md', '/index.html').replace('.md', '/index.html')
    )
-    version_prefix = f'/{args.version_prefix}/' if args.version_prefix else '/'
    target_path = to_path.replace('/index.md', '/').replace('.md', '/')
-    to_url = f'/{base_prefix}{version_prefix}{lang}/{target_path}'
+    to_url = f'/{base_prefix}/{lang}/{target_path}'
    to_url = to_url.strip()
    write_redirect_html(out_path, to_url)

--- a/docs/tools/release.sh
+++ b/docs/tools/release.sh
@ -7,19 +7,22 @@ PUBLISH_DIR="${BASE_DIR}/../publish"
 BASE_DOMAIN="${BASE_DOMAIN:-content.clickhouse.tech}"
 GIT_TEST_URI="${GIT_TEST_URI:-git@github.com:ClickHouse/clickhouse-website-content.git}"
 GIT_PROD_URI="git@github.com:ClickHouse/clickhouse-website-content.git"
-EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS:---enable-stable-releases --minify --verbose}"
-HISTORY_SIZE="${HISTORY_SIZE:-5}"
+EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS:---minify --verbose}"

 if [[ -z "$1" ]]
 then
    source "${BASE_DIR}/venv/bin/activate"
    python3 "${BASE_DIR}/build.py" ${EXTRA_BUILD_ARGS}
-    rm -rf "${PUBLISH_DIR}" || true
-    git clone "${GIT_TEST_URI}" "${PUBLISH_DIR}"
-    cd "${PUBLISH_DIR}"
+    rm -rf "${PUBLISH_DIR}"
+    mkdir "${PUBLISH_DIR}" && cd "${PUBLISH_DIR}"
+
+    # Will make a repository with website content as the only commit.
+    git init
+    git remote add origin "${GIT_TEST_URI}"
    git config user.email "robot-clickhouse@yandex-team.ru"
    git config user.name "robot-clickhouse"
-    git rm -rf *
+
+    # Add files.
    cp -R "${BUILD_DIR}"/* .
    echo -n "${BASE_DOMAIN}" > CNAME
    echo -n "" > README.md
@ -27,16 +30,16 @@ then
    cp "${BASE_DIR}/../../LICENSE" .
    git add *
    git add ".nojekyll"
-    git commit -a -m "add new release at $(date)"
-    NEW_ROOT_COMMIT=$(git rev-parse "HEAD~${HISTORY_SIZE}")
-    git checkout --orphan temp "${NEW_ROOT_COMMIT}"
-    git commit -m "root commit"
-    git rebase --onto temp "${NEW_ROOT_COMMIT}" master
-    git branch -D temp
-    git push -f origin master
+
+    # Push to GitHub rewriting the existing contents.
+    git commit --quiet -m "Add new release at $(date)"
+    git push --force origin master
+
    if [[ ! -z "${CLOUDFLARE_TOKEN}" ]]
    then
        sleep 1m
-        python3 "${BASE_DIR}/purge_cache_for_changed_files.py"
+        # https://api.cloudflare.com/#zone-purge-files-by-cache-tags,-host-or-prefix
+        POST_DATA='{"hosts":["content.clickhouse.tech"]}'
+        curl -X POST "https://api.cloudflare.com/client/v4/zones/4fc6fb1d46e87851605aa7fa69ca6fe0/purge_cache" -H "Authorization: Bearer ${CLOUDFLARE_TOKEN}" -H "Content-Type:application/json" --data "${POST_DATA}"
    fi
 fi
--- a/docs/tools/single_page.py
+++ b/docs/tools/single_page.py
@ -111,10 +111,7 @@ def build_single_page_version(lang, args, nav, cfg):
                if not args.test_only:
                    mkdocs.commands.build.build(cfg)

-                    if args.version_prefix:
-                        single_page_output_path = os.path.join(args.docs_dir, args.docs_output_dir, args.version_prefix, lang, 'single')
-                    else:
-                        single_page_output_path = os.path.join(args.docs_dir, args.docs_output_dir, lang, 'single')
+                    single_page_output_path = os.path.join(args.docs_dir, args.docs_output_dir, lang, 'single')

                    if os.path.exists(single_page_output_path):
                        shutil.rmtree(single_page_output_path)
@ -157,10 +154,9 @@ def build_single_page_version(lang, args, nav, cfg):
                    if args.save_raw_single_page:
                        shutil.copytree(test_dir, args.save_raw_single_page)

-                    if not args.version_prefix:  # maybe enable in future
-                        logging.info(f'Running tests for {lang}')
-                        test.test_single_page(
-                            os.path.join(test_dir, 'single', 'index.html'), lang)
+                    logging.info(f'Running tests for {lang}')
+                    test.test_single_page(
+                        os.path.join(test_dir, 'single', 'index.html'), lang)

                    if not args.skip_pdf:
                        single_page_index_html = os.path.join(test_dir, 'single', 'index.html')
--- a/docs/tools/translate/translate.py
+++ b/docs/tools/translate/translate.py
@ -11,8 +11,6 @@ import googletrans
 import requests
 import yaml

-import typograph_ru
-

 translator = googletrans.Translator()
 default_target_language = os.environ.get('TARGET_LANGUAGE', 'ru')
@ -25,8 +23,6 @@ def translate_impl(text, target_language=None):
    target_language = target_language or default_target_language
    if target_language == 'en':
        return text
-    elif target_language == 'typograph_ru':
-        return typograph_ru.typograph(text)
    elif is_yandex:
        text = text.replace('‘', '\'')
        text = text.replace('’', '\'')
@ -59,25 +55,10 @@ def translate(text, target_language=None):
    )


-def translate_toc(root, lang):
-    global is_yandex
-    is_yandex = True
-    if isinstance(root, dict):
-        result = []
-        for key, value in root.items():
-            key = translate(key, lang) if key != 'hidden' and not key.isupper() else key
-            result.append((key, translate_toc(value, lang),))
-        return dict(result)
-    elif isinstance(root, list):
-        return [translate_toc(item, lang) for item in root]
-    elif isinstance(root, str):
-        return root
-
-
 def translate_po():
    import babel.messages.pofile
    base_dir = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'website', 'locale')
-    for lang in ['en', 'zh', 'es', 'fr', 'ru', 'ja', 'tr', 'fa']:
+    for lang in ['en', 'zh', 'es', 'fr', 'ru', 'ja']:
        po_path = os.path.join(base_dir, lang, 'LC_MESSAGES', 'messages.po')
        with open(po_path, 'r') as f:
            po_file = babel.messages.pofile.read_po(f, locale=lang, domain='messages')
--- a/docs/tools/website.py
+++ b/docs/tools/website.py
@ -232,6 +232,7 @@ def minify_website(args):
            f"'{args.output_dir}/docs/en/**/*.html' '{args.website_dir}/js/**/*.js' > {css_out}"
    else:
        command = f'cat {css_in} > {css_out}'
+
    logging.info(command)
    output = subprocess.check_output(command, shell=True)
    logging.debug(output)
--- a/docs/zh/sql-reference/data-types/datetime64.md
+++ b/docs/zh/sql-reference/data-types/datetime64.md
@ -7,9 +7,9 @@ toc_title: DateTime64

 # Datetime64 {#data_type-datetime64}

-允许存储时间instant间，可以表示为日历日期和一天中的时间，具有定义的亚秒精度
+此类型允许以日期（date）加时间（time）的形式来存储一个时刻的时间值，具有定义的亚秒精度

-刻度尺寸（精度）：10<sup>-精度</sup> 秒
+时间刻度大小（精度）：10<sup>-精度</sup> 秒

 语法:

@ -17,11 +17,11 @@ toc_title: DateTime64
 DateTime64(precision, [timezone])
 ```

-在内部，存储数据作为一些 ‘ticks’ 自纪元开始(1970-01-01 00:00:00UTC)作为Int64. 刻度分辨率由precision参数确定。 此外，该 `DateTime64` 类型可以存储时区是相同的整个列，影响如何的值 `DateTime64` 类型值以文本格式显示，以及如何解析指定为字符串的值 (‘2020-01-01 05:00:01.000’). 时区不存储在表的行中（或resultset中），而是存储在列元数据中。 查看详细信息 [日期时间](datetime.md).
+在内部，此类型以Int64类型将数据存储为自Unix纪元开始(1970-01-01 00:00:00UTC)的时间刻度数（ticks）。时间刻度的分辨率由precision参数确定。此外，`DateTime64` 类型可以存储对整列都相同的时区信息，时区会影响 `DateTime64` 类型的值如何以文本格式显示，以及如何解析以字符串形式指定的时间数据 (‘2020-01-01 05:00:01.000’)。时区不存储在表的行中（也不在resultset中），而是存储在列的元数据中。详细信息请参考 [DateTime](datetime.md) 数据类型。

-## 例 {#examples}
+## 示例 {#examples}

-**1.** 创建一个表 `DateTime64`-输入列并将数据插入其中:
+**1.** 创建一个具有 `DateTime64` 类型列的表，并向其中插入数据:

 ``` sql
 CREATE TABLE dt
@ -47,10 +47,10 @@ SELECT * FROM dt
 └─────────────────────────┴──────────┘
 ```

-   将日期时间作为整数插入时，将其视为适当缩放的Unix时间戳(UTC)。 `1546300800000` （精度为3）表示 `'2019-01-01 00:00:00'` UTC. 然而，作为 `timestamp` 列有 `Europe/Moscow` （UTC+3）指定的时区，当输出为字符串时，该值将显示为 `'2019-01-01 03:00:00'`
-   当插入字符串值作为日期时间时，它被视为处于列时区。 `'2019-01-01 00:00:00'` 将被视为 `Europe/Moscow` 时区并存储为 `1546290000000`.
+-   将日期时间作为integer类型插入时，它会被视为适当缩放的Unix时间戳(UTC)。`1546300800000` （精度为3）表示 `'2019-01-01 00:00:00'` UTC. 不过，因为 `timestamp` 列指定了 `Europe/Moscow` （UTC+3）的时区，当作为字符串输出时，它将显示为 `'2019-01-01 03:00:00'`
+-   当把字符串作为日期时间插入时，它会被视为处于该列的时区。 `'2019-01-01 00:00:00'` 将被认为处于 `Europe/Moscow` 时区并被存储为 `1546290000000`.

-**2.** 过滤 `DateTime64` 值
+**2.** 过滤 `DateTime64` 类型的值

 ``` sql
 SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow')
@ -62,9 +62,9 @@ SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europ
 └─────────────────────────┴──────────┘
 ```

-不像 `DateTime`, `DateTime64` 值不转换为 `String` 自动
+与 `DateTime` 不同, `DateTime64` 类型的值不会自动从 `String` 类型的值转换过来

-**3.** 获取一个时区 `DateTime64`-类型值:
+**3.** 获取 `DateTime64` 类型值的时区信息:

 ``` sql
 SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x
@ -97,8 +97,9 @@ FROM dt
 -   [类型转换函数](../../sql-reference/functions/type-conversion-functions.md)
 -   [用于处理日期和时间的函数](../../sql-reference/functions/date-time-functions.md)
 -   [用于处理数组的函数](../../sql-reference/functions/array-functions.md)
-   [该 `date_time_input_format` 设置](../../operations/settings/settings.md#settings-date_time_input_format)
-   [该 `timezone` 服务器配置参数](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone)
-   [使用日期和时间的操作员](../../sql-reference/operators/index.md#operators-datetime)
+-   [`date_time_input_format` 配置](../../operations/settings/settings.md#settings-date_time_input_format)
+-   [`date_time_output_format` 配置](../../operations/settings/settings.md#settings-date_time_output_format)
+-   [`timezone` 服务器配置参数](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone)
+-   [用于处理日期和时间的算子](../../sql-reference/operators/index.md#operators-datetime)
 -   [`Date` 数据类型](date.md)
 -   [`DateTime` 数据类型](datetime.md)
--- a/programs/client/Client.cpp
+++ b/programs/client/Client.cpp
@ -949,6 +949,11 @@ private:
            TestHint test_hint(test_mode, all_queries_text);
            if (test_hint.clientError() || test_hint.serverError())
                processTextAsSingleQuery("SET send_logs_level = 'none'");
+
+            // Echo all queries if asked; makes for a more readable reference
+            // file.
+            if (test_hint.echoQueries())
+                echo_queries = true;
        }

        /// Several queries separated by ';'.
--- a/programs/client/QueryFuzzer.cpp
+++ b/programs/client/QueryFuzzer.cpp
@ -14,6 +14,7 @@
 #include <Parsers/ASTIdentifier.h>
 #include <Parsers/ASTInsertQuery.h>
 #include <Parsers/ASTLiteral.h>
+#include <Parsers/ASTOrderByElement.h>
 #include <Parsers/ASTQueryWithOutput.h>
 #include <Parsers/ASTSelectQuery.h>
 #include <Parsers/ASTSelectWithUnionQuery.h>
@ -28,6 +29,11 @@
 namespace DB
 {

+namespace ErrorCodes
+{
+    extern const int TOO_DEEP_RECURSION;
+}
+
 Field QueryFuzzer::getRandomField(int type)
 {
    switch (type)
@ -205,14 +211,88 @@ void QueryFuzzer::replaceWithTableLike(ASTPtr & ast)
    ast = new_ast;
 }

-void QueryFuzzer::fuzzColumnLikeExpressionList(ASTPtr ast)
+void QueryFuzzer::fuzzOrderByElement(ASTOrderByElement * elem)
+{
+    // Randomly tweaks one aspect of a single ORDER BY element in place.
+    // A single random draw selects the mutation; draws 5..9 leave the
+    // element untouched.
+    const auto roll = fuzz_rand() % 10;
+
+    if (roll == 0 || roll == 1)
+    {
+        // Force the sort direction: -1 for draw 0, 1 for draw 1.
+        elem->direction = (roll == 0) ? -1 : 1;
+    }
+    else if (roll == 2 || roll == 3)
+    {
+        // Force an explicitly specified NULLs direction: -1 for draw 2,
+        // 1 for draw 3.
+        elem->nulls_direction = (roll == 2) ? -1 : 1;
+        elem->nulls_direction_was_explicitly_specified = true;
+    }
+    else if (roll == 4)
+    {
+        // Make the NULLs direction follow the sort direction and mark it
+        // as not explicitly specified.
+        elem->nulls_direction = elem->direction;
+        elem->nulls_direction_was_explicitly_specified = false;
+    }
+}
+
+void QueryFuzzer::fuzzOrderByList(IAST * ast)
 {
    if (!ast)
    {
        return;
    }

-    auto * impl = assert_cast<ASTExpressionList *>(ast.get());
+    auto * list = assert_cast<ASTExpressionList *>(ast);
+
+    // Remove element
+    if (fuzz_rand() % 50 == 0 && list->children.size() > 1)
+    {
+        // Don't remove last element -- this leads to questionable
+        // constructs such as empty select.
+        list->children.erase(list->children.begin()
+                             + fuzz_rand() % list->children.size());
+    }
+
+    // Add element
+    if (fuzz_rand() % 50 == 0)
+    {
+        auto pos = list->children.empty()
+                ? list->children.begin()
+                : list->children.begin() + fuzz_rand() % list->children.size();
+        auto col = getRandomColumnLike();
+        if (col)
+        {
+            auto elem = std::make_shared<ASTOrderByElement>();
+            elem->children.push_back(col);
+            elem->direction = 1;
+            elem->nulls_direction = 1;
+            elem->nulls_direction_was_explicitly_specified = false;
+            elem->with_fill = false;
+
+            list->children.insert(pos, elem);
+        }
+        else
+        {
+            fprintf(stderr, "no random col!\n");
+        }
+    }
+
+    // We don't have to recurse here to fuzz the children, this is handled by
+    // the generic recursion into IAST.children.
+}
+
+void QueryFuzzer::fuzzColumnLikeExpressionList(IAST * ast)
+{
+    if (!ast)
+    {
+        return;
+    }
+
+    auto * impl = assert_cast<ASTExpressionList *>(ast);

    // Remove element
    if (fuzz_rand() % 50 == 0 && impl->children.size() > 1)
@ -252,11 +332,44 @@ void QueryFuzzer::fuzz(ASTs & asts)
    }
 }

+// Scope guard for a counter: bumps the referenced counter on construction
+// and undoes the bump when the guard goes out of scope.
+struct ScopedIncrement
+{
+    explicit ScopedIncrement(size_t & counter_)
+        : counter(counter_)
+    {
+        ++counter;
+    }
+
+    ~ScopedIncrement()
+    {
+        --counter;
+    }
+
+    // The counter being tracked; owned by the caller.
+    size_t & counter;
+};
+
 void QueryFuzzer::fuzz(ASTPtr & ast)
 {
    if (!ast)
        return;

+    // Check for exceeding max depth.
+    ScopedIncrement depth_increment(current_ast_depth);
+    if (current_ast_depth > 500)
+    {
+        // The AST is too deep (see the comment for current_ast_depth). Throw
+        // an exception to fail fast and not use this query as an etalon, or we'll
+        // end up in a very slow and useless loop. It also makes sense to set it
+        // lower than the default max parse depth on the server (1000), so that
+        // we don't get the useless error about parse depth from the server either.
+        throw Exception(ErrorCodes::TOO_DEEP_RECURSION,
+            "AST depth exceeded while fuzzing ({})", current_ast_depth);
+    }
+
+    // Check for loops.
+    auto [_, inserted] = debug_visited_nodes.insert(ast.get());
+    if (!inserted)
+    {
+        fmt::print(stderr, "The AST node '{}' was already visited before."
+            " Depth {}, {} visited nodes, current top AST:\n{}\n",
+            static_cast<void *>(ast.get()), current_ast_depth,
+            debug_visited_nodes.size(), (*debug_top_ast)->dumpTree());
+        assert(false);
+    }
+
+    // The fuzzing.
    if (auto * with_union = typeid_cast<ASTSelectWithUnionQuery *>(ast.get()))
    {
        fuzz(with_union->list_of_selects);
@ -281,17 +394,28 @@ void QueryFuzzer::fuzz(ASTPtr & ast)
    {
        fuzz(expr_list->children);
    }
+    else if (auto * order_by_element = typeid_cast<ASTOrderByElement *>(ast.get()))
+    {
+        fuzzOrderByElement(order_by_element);
+    }
    else if (auto * fn = typeid_cast<ASTFunction *>(ast.get()))
    {
-        fuzzColumnLikeExpressionList(fn->arguments);
-        fuzzColumnLikeExpressionList(fn->parameters);
+        fuzzColumnLikeExpressionList(fn->arguments.get());
+        fuzzColumnLikeExpressionList(fn->parameters.get());
+
+        if (fn->is_window_function)
+        {
+            fuzzColumnLikeExpressionList(fn->window_partition_by);
+            fuzzOrderByList(fn->window_order_by);
+        }

        fuzz(fn->children);
    }
    else if (auto * select = typeid_cast<ASTSelectQuery *>(ast.get()))
    {
-        fuzzColumnLikeExpressionList(select->select());
-        fuzzColumnLikeExpressionList(select->groupBy());
+        fuzzColumnLikeExpressionList(select->select().get());
+        fuzzColumnLikeExpressionList(select->groupBy().get());
+        fuzzOrderByList(select->orderBy().get());

        fuzz(select->children);
    }
@ -416,6 +540,10 @@ void QueryFuzzer::collectFuzzInfoRecurse(const ASTPtr ast)

 void QueryFuzzer::fuzzMain(ASTPtr & ast)
 {
+    current_ast_depth = 0;
+    debug_visited_nodes.clear();
+    debug_top_ast = &ast;
+
    collectFuzzInfoMain(ast);
    fuzz(ast);

--- a/programs/client/QueryFuzzer.h
+++ b/programs/client/QueryFuzzer.h
@ -12,6 +12,9 @@
 namespace DB
 {

+class ASTExpressionList;
+class ASTOrderByElement;
+
 /*
 * This is an AST-based query fuzzer that makes random modifications to query
 * AST, changing numbers, list of columns, functions, etc. It remembers part of
@ -23,6 +26,13 @@ struct QueryFuzzer
 {
    pcg64 fuzz_rand{randomSeed()};

+    // We add elements to expression lists with fixed probability. Some elements
+    // are so large, that the expected number of elements we add to them is
+    // one or higher, hence this process might never finish. Put some limit on the
+    // total depth of AST to prevent this.
+    // This field is reset for each fuzzMain() call.
+    size_t current_ast_depth = 0;
+
    // These arrays hold parts of queries that we can substitute into the query
    // we are currently fuzzing. We add some part from each new query we are asked
    // to fuzz, and keep this state between queries, so the fuzzing output becomes
@ -36,6 +46,12 @@ struct QueryFuzzer
    std::unordered_map<std::string, ASTPtr> table_like_map;
    std::vector<ASTPtr> table_like;

+    // Some debug fields for detecting problematic ASTs with loops.
+    // These are reset for each fuzzMain call.
+    // debug_visited_nodes collects every node fuzz() has seen during the
+    // current run; debug_top_ast points at the root passed to fuzzMain() and
+    // is only dereferenced to dump the tree when a repeated node is found.
+    // It starts as nullptr so that it never holds an indeterminate value
+    // before the first fuzzMain() call.
+    std::unordered_set<const IAST *> debug_visited_nodes;
+    ASTPtr * debug_top_ast = nullptr;
+
+
    // This is the only function you have to call -- it will modify the passed
    // ASTPtr to point to new AST with some random changes.
    void fuzzMain(ASTPtr & ast);
@ -46,7 +62,9 @@ struct QueryFuzzer
    ASTPtr getRandomColumnLike();
    void replaceWithColumnLike(ASTPtr & ast);
    void replaceWithTableLike(ASTPtr & ast);
-    void fuzzColumnLikeExpressionList(ASTPtr ast);
+    void fuzzOrderByElement(ASTOrderByElement * elem);
+    void fuzzOrderByList(IAST * ast);
+    void fuzzColumnLikeExpressionList(IAST * ast);
    void fuzz(ASTs & asts);
    void fuzz(ASTPtr & ast);
    void collectFuzzInfoMain(const ASTPtr ast);
--- a/programs/client/TestHint.h
+++ b/programs/client/TestHint.h
@ -19,6 +19,7 @@ namespace ErrorCodes

 /// Checks expected server and client error codes in testmode.
 /// To enable it add special comment after the query: "-- { serverError 60 }" or "-- { clientError 20 }".
+/// Also you can enable echoing all queries by writing "-- { echo }".
 class TestHint
 {
 public:
@ -84,12 +85,14 @@ public:

    int serverError() const { return server_error; }
    int clientError() const { return client_error; }
+    bool echoQueries() const { return echo; }

 private:
    bool enabled = false;
    const String & query;
    int server_error = 0;
    int client_error = 0;
+    bool echo = false;

    void parse(const String & hint)
    {
@ -107,6 +110,8 @@ private:
                ss >> server_error;
            else if (item == "clientError")
                ss >> client_error;
+            else if (item == "echo")
+                echo = true;
        }
    }

--- a/programs/install/Install.cpp
+++ b/programs/install/Install.cpp
@ -10,6 +10,10 @@
    #include <linux/capability.h>
 #endif

+#if defined(OS_DARWIN)
+    #include <mach-o/dyld.h>
+#endif
+
 #include <Common/Exception.h>
 #include <Common/ShellCommand.h>
 #include <Common/formatReadable.h>
@ -147,9 +151,24 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
    try
    {
        /// We need to copy binary to the binary directory.
-        /// The binary is currently run. We need to obtain its path from procfs.
+        /// The binary is currently run. We need to obtain its path from procfs (on Linux).

+#if defined(OS_DARWIN)
+        /// There is no procfs on macOS, so ask dyld for the path of the running executable.
+        /// The first call (null buffer, zero size) is only used to learn the required buffer size.
+        uint32_t path_length = 0;
+        _NSGetExecutablePath(nullptr, &path_length);
+        if (path_length <= 1)
+            throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary");
+
+        std::string path(path_length, std::string::value_type());
+        auto res = _NSGetExecutablePath(&path[0], &path_length);
+        if (res != 0)
+            throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary");
+
+        /// The reported size counts the terminating NUL; c_str() keeps it out of the resulting path.
+        fs::path binary_self_path(path.c_str());
+#else
        fs::path binary_self_path = "/proc/self/exe";
+#endif
+
        if (!fs::exists(binary_self_path))
            throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary from {}, file doesn't exist",
                            binary_self_path.string());
--- a/programs/main.cpp
+++ b/programs/main.cpp
@ -308,53 +308,11 @@ void checkRequiredInstructions()
    }
 }

-#ifdef __linux__
-/// clickhouse uses jemalloc as a production allocator
-/// and jemalloc relies on working MADV_DONTNEED,
-/// which doesn't work under qemu
-///
-/// but do this only under for linux, since only it return zeroed pages after MADV_DONTNEED
-/// (and jemalloc assumes this too, see contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in)
-void checkRequiredMadviseFlags()
-{
-    size_t size = 1 << 16;
-    void * addr = mmap(nullptr, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
-    if (addr == MAP_FAILED)
-    {
-        writeError("Can not mmap pages for MADV_DONTNEED check\n");
-        _Exit(1);
-    }
-    memset(addr, 'A', size);
-
-    if (!madvise(addr, size, MADV_DONTNEED))
-    {
-        /// Suboptimal, but should be simple.
-        for (size_t i = 0; i < size; ++i)
-        {
-            if (reinterpret_cast<unsigned char *>(addr)[i] != 0)
-            {
-                writeError("MADV_DONTNEED does not zeroed page. jemalloc will be broken\n");
-                _Exit(1);
-            }
-        }
-    }
-
-    if (munmap(addr, size))
-    {
-        writeError("Can not munmap pages for MADV_DONTNEED check\n");
-        _Exit(1);
-    }
-}
-#endif
-
 struct Checker
 {
    Checker()
    {
        checkRequiredInstructions();
-#ifdef __linux__
-        checkRequiredMadviseFlags();
-#endif
    }
 } checker;

--- a/programs/odbc-bridge/ODBCBridge.cpp
+++ b/programs/odbc-bridge/ODBCBridge.cpp
@ -89,7 +89,7 @@ void ODBCBridge::defineOptions(Poco::Util::OptionSet & options)
 {
    options.addOption(Poco::Util::Option("http-port", "", "port to listen").argument("http-port", true).binding("http-port"));
    options.addOption(
-        Poco::Util::Option("listen-host", "", "hostname to listen, default localhost").argument("listen-host").binding("listen-host"));
+        Poco::Util::Option("listen-host", "", "hostname or address to listen, default 127.0.0.1").argument("listen-host").binding("listen-host"));
    options.addOption(
        Poco::Util::Option("http-timeout", "", "http timeout for socket, default 1800").argument("http-timeout").binding("http-timeout"));

@ -161,7 +161,7 @@ void ODBCBridge::initialize(Application & self)
    BaseDaemon::logRevision();

    log = &logger();
-    hostname = config().getString("listen-host", "localhost");
+    hostname = config().getString("listen-host", "127.0.0.1");
    port = config().getUInt("http-port");
    if (port > 0xFFFF)
        throw Exception("Out of range 'http-port': " + std::to_string(port), ErrorCodes::ARGUMENT_OUT_OF_BOUND);
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -4,6 +4,7 @@
 #include <sys/resource.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <sys/wait.h>
 #include <errno.h>
 #include <pwd.h>
 #include <unistd.h>
@ -103,6 +104,12 @@ namespace CurrentMetrics
 int mainEntryClickHouseServer(int argc, char ** argv)
 {
    DB::Server app;
+
+    /// Do not fork a separate process from the watchdog if we are attached to a terminal.
+    /// Otherwise it breaks gdb usage.
+    if (argc > 0 && !isatty(STDIN_FILENO) && !isatty(STDOUT_FILENO) && !isatty(STDERR_FILENO))
+        app.shouldSetupWatchdog(argv[0]);
+
    try
    {
        return app.run(argc, argv);
@ -366,6 +373,7 @@ void checkForUsersNotInMainConfig(
 int Server::main(const std::vector<std::string> & /*args*/)
 {
    Poco::Logger * log = &logger();
+
    UseSSL use_ssl;

    MainThreadStatus::getInstance();
@ -770,7 +778,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
    http_params->setTimeout(settings.http_receive_timeout);
    http_params->setKeepAliveTimeout(keep_alive_timeout);

-    std::vector<ProtocolServerAdapter> servers_to_start_before_tables;
+    auto servers_to_start_before_tables = std::make_shared<std::vector<ProtocolServerAdapter>>();

    std::vector<std::string> listen_hosts = DB::getMultipleValuesFromConfig(config(), "", "listen_host");

@ -792,7 +800,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
            auto address = socketBindListen(socket, listen_host, port);
            socket.setReceiveTimeout(settings.receive_timeout);
            socket.setSendTimeout(settings.send_timeout);
-            servers_to_start_before_tables.emplace_back(
+            servers_to_start_before_tables->emplace_back(
                port_name,
                std::make_unique<Poco::Net::TCPServer>(
                    new TestKeeperTCPHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams));
@ -801,7 +809,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
        });
    }

-    for (auto & server : servers_to_start_before_tables)
+    for (auto & server : *servers_to_start_before_tables)
        server.start();

    SCOPE_EXIT({
@ -816,11 +824,11 @@ int Server::main(const std::vector<std::string> & /*args*/)

        LOG_DEBUG(log, "Shut down storages.");

-        if (!servers_to_start_before_tables.empty())
+        if (!servers_to_start_before_tables->empty())
        {
            LOG_DEBUG(log, "Waiting for current connections to servers for tables to finish.");
            int current_connections = 0;
-            for (auto & server : servers_to_start_before_tables)
+            for (auto & server : *servers_to_start_before_tables)
            {
                server.stop();
                current_connections += server.currentConnections();
@ -832,7 +840,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
                LOG_INFO(log, "Closed all listening sockets.");

            if (current_connections > 0)
-                current_connections = waitServersToFinish(servers_to_start_before_tables, config().getInt("shutdown_wait_unfinished", 5));
+                current_connections = waitServersToFinish(*servers_to_start_before_tables, config().getInt("shutdown_wait_unfinished", 5));

            if (current_connections)
                LOG_INFO(log, "Closed connections to servers for tables. But {} remain. Probably some tables of other users cannot finish their connections after context shutdown.", current_connections);
@ -978,7 +986,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
    LOG_INFO(log, "TaskStats is not implemented for this OS. IO accounting will be disabled.");
 #endif

-    std::vector<ProtocolServerAdapter> servers;
+    auto servers = std::make_shared<std::vector<ProtocolServerAdapter>>();
    {
        /// This object will periodically calculate some metrics.
        AsynchronousMetrics async_metrics(
@ -996,7 +1004,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
                socket.setReceiveTimeout(settings.http_receive_timeout);
                socket.setSendTimeout(settings.http_send_timeout);

-                servers.emplace_back(port_name, std::make_unique<Poco::Net::HTTPServer>(
+                servers->emplace_back(port_name, std::make_unique<Poco::Net::HTTPServer>(
                    createHandlerFactory(*this, async_metrics, "HTTPHandler-factory"), server_pool, socket, http_params));

                LOG_INFO(log, "Listening for http://{}", address.toString());
@ -1011,7 +1019,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
                auto address = socketBindListen(socket, listen_host, port, /* secure = */ true);
                socket.setReceiveTimeout(settings.http_receive_timeout);
                socket.setSendTimeout(settings.http_send_timeout);
-                servers.emplace_back(port_name, std::make_unique<Poco::Net::HTTPServer>(
+                servers->emplace_back(port_name, std::make_unique<Poco::Net::HTTPServer>(
                    createHandlerFactory(*this, async_metrics, "HTTPSHandler-factory"), server_pool, socket, http_params));

                LOG_INFO(log, "Listening for https://{}", address.toString());
@ -1030,7 +1038,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
                auto address = socketBindListen(socket, listen_host, port);
                socket.setReceiveTimeout(settings.receive_timeout);
                socket.setSendTimeout(settings.send_timeout);
-                servers.emplace_back(port_name, std::make_unique<Poco::Net::TCPServer>(
+                servers->emplace_back(port_name, std::make_unique<Poco::Net::TCPServer>(
                    new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ false),
                    server_pool,
                    socket,
@ -1047,7 +1055,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
                auto address = socketBindListen(socket, listen_host, port);
                socket.setReceiveTimeout(settings.receive_timeout);
                socket.setSendTimeout(settings.send_timeout);
-                servers.emplace_back(port_name, std::make_unique<Poco::Net::TCPServer>(
+                servers->emplace_back(port_name, std::make_unique<Poco::Net::TCPServer>(
                    new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ true),
                    server_pool,
                    socket,
@ -1065,7 +1073,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
                auto address = socketBindListen(socket, listen_host, port, /* secure = */ true);
                socket.setReceiveTimeout(settings.receive_timeout);
                socket.setSendTimeout(settings.send_timeout);
-                servers.emplace_back(port_name, std::make_unique<Poco::Net::TCPServer>(
+                servers->emplace_back(port_name, std::make_unique<Poco::Net::TCPServer>(
                    new TCPHandlerFactory(*this, /* secure */ true, /* proxy protocol */ false),
                    server_pool,
                    socket,
@ -1086,7 +1094,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
                auto address = socketBindListen(socket, listen_host, port);
                socket.setReceiveTimeout(settings.http_receive_timeout);
                socket.setSendTimeout(settings.http_send_timeout);
-                servers.emplace_back(port_name, std::make_unique<Poco::Net::HTTPServer>(
+                servers->emplace_back(port_name, std::make_unique<Poco::Net::HTTPServer>(
                    createHandlerFactory(*this, async_metrics, "InterserverIOHTTPHandler-factory"), server_pool, socket, http_params));

                LOG_INFO(log, "Listening for replica communication (interserver): http://{}", address.toString());
@ -1100,7 +1108,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
                auto address = socketBindListen(socket, listen_host, port, /* secure = */ true);
                socket.setReceiveTimeout(settings.http_receive_timeout);
                socket.setSendTimeout(settings.http_send_timeout);
-                servers.emplace_back(port_name, std::make_unique<Poco::Net::HTTPServer>(
+                servers->emplace_back(port_name, std::make_unique<Poco::Net::HTTPServer>(
                    createHandlerFactory(*this, async_metrics, "InterserverIOHTTPSHandler-factory"), server_pool, socket, http_params));

                LOG_INFO(log, "Listening for secure replica communication (interserver): https://{}", address.toString());
@ -1118,7 +1126,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
                auto address = socketBindListen(socket, listen_host, port, /* secure = */ true);
                socket.setReceiveTimeout(Poco::Timespan());
                socket.setSendTimeout(settings.send_timeout);
-                servers.emplace_back(port_name, std::make_unique<Poco::Net::TCPServer>(
+                servers->emplace_back(port_name, std::make_unique<Poco::Net::TCPServer>(
                    new MySQLHandlerFactory(*this),
                    server_pool,
                    socket,
@ -1134,7 +1142,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
                auto address = socketBindListen(socket, listen_host, port, /* secure = */ true);
                socket.setReceiveTimeout(Poco::Timespan());
                socket.setSendTimeout(settings.send_timeout);
-                servers.emplace_back(port_name, std::make_unique<Poco::Net::TCPServer>(
+                servers->emplace_back(port_name, std::make_unique<Poco::Net::TCPServer>(
                    new PostgreSQLHandlerFactory(*this),
                    server_pool,
                    socket,
@ -1148,7 +1156,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
            createServer(listen_host, port_name, listen_try, [&](UInt16 port)
            {
                Poco::Net::SocketAddress server_address(listen_host, port);
-                servers.emplace_back(port_name, std::make_unique<GRPCServer>(*this, makeSocketAddress(listen_host, port, log)));
+                servers->emplace_back(port_name, std::make_unique<GRPCServer>(*this, makeSocketAddress(listen_host, port, log)));
                LOG_INFO(log, "Listening for gRPC protocol: " + server_address.toString());
            });
 #endif
@ -1161,14 +1169,14 @@ int Server::main(const std::vector<std::string> & /*args*/)
                auto address = socketBindListen(socket, listen_host, port);
                socket.setReceiveTimeout(settings.http_receive_timeout);
                socket.setSendTimeout(settings.http_send_timeout);
-                servers.emplace_back(port_name, std::make_unique<Poco::Net::HTTPServer>(
+                servers->emplace_back(port_name, std::make_unique<Poco::Net::HTTPServer>(
                    createHandlerFactory(*this, async_metrics, "PrometheusHandler-factory"), server_pool, socket, http_params));

                LOG_INFO(log, "Listening for Prometheus: http://{}", address.toString());
            });
        }

-        if (servers.empty())
+        if (servers->empty())
             throw Exception("No servers started (add valid listen_host and 'tcp_port' or 'http_port' to configuration file.)",
                ErrorCodes::NO_ELEMENTS_IN_CONFIG);

@ -1176,7 +1184,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
        async_metrics.start();
        global_context->enableNamedSessions();

-        for (auto & server : servers)
+        for (auto & server : *servers)
            server.start();

        {
@ -1208,7 +1216,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
            is_cancelled = true;

            int current_connections = 0;
-            for (auto & server : servers)
+            for (auto & server : *servers)
            {
                server.stop();
                current_connections += server.currentConnections();
@ -1223,7 +1231,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
            global_context->getProcessList().killAllQueries();

            if (current_connections)
-                current_connections = waitServersToFinish(servers, config().getInt("shutdown_wait_unfinished", 5));
+                current_connections = waitServersToFinish(*servers, config().getInt("shutdown_wait_unfinished", 5));

            if (current_connections)
                LOG_INFO(log, "Closed connections. But {} remain."
--- a/src/AggregateFunctions/AggregateFunctionAvg.h
+++ b/src/AggregateFunctions/AggregateFunctionAvg.h
@ -127,10 +127,10 @@ public:
    void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override
    {
        if constexpr (IsDecimalNumber<Numerator> || IsDecimalNumber<Denominator>)
-            static_cast<ColumnVector<Float64> &>(to).getData().push_back(
+            assert_cast<ColumnVector<Float64> &>(to).getData().push_back(
                this->data(place).divideIfAnyDecimal(num_scale, denom_scale));
        else
-            static_cast<ColumnVector<Float64> &>(to).getData().push_back(this->data(place).divide());
+            assert_cast<ColumnVector<Float64> &>(to).getData().push_back(this->data(place).divide());
    }
 private:
    UInt32 num_scale;
--- a/src/AggregateFunctions/IAggregateFunction.h
+++ b/src/AggregateFunctions/IAggregateFunction.h
@ -104,9 +104,12 @@ public:
        return false;
    }

-    /// Inserts results into a column.
-    /// This method must be called once, from single thread.
-    /// After this method was called for state, you can't do anything with state but destroy.
+    /// Inserts results into a column. This method might modify the state (e.g.
+    /// sort an array), so must be called once, from single thread. The state
+    /// must remain valid though, and the subsequent calls to add/merge/
+    /// insertResultInto must work correctly. This kind of call sequence occurs
+    /// in `runningAccumulate`, or when calculating an aggregate function as a
+    /// window function.
    virtual void insertResultInto(AggregateDataPtr place, IColumn & to, Arena * arena) const = 0;

    /// Used for machine learning methods. Predict result from trained model.
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -363,7 +363,7 @@ endif ()

 if (USE_PARQUET)
    dbms_target_link_libraries(PRIVATE ${PARQUET_LIBRARY})
-    if (NOT USE_INTERNAL_PARQUET_LIBRARY OR USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
+    if (NOT USE_INTERNAL_PARQUET_LIBRARY)
        dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${PARQUET_INCLUDE_DIR} ${ARROW_INCLUDE_DIR})
        if (USE_STATIC_LIBRARIES)
            dbms_target_link_libraries(PRIVATE ${ARROW_LIBRARY})
@ -436,6 +436,8 @@ if (USE_ROCKSDB)
    dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${ROCKSDB_INCLUDE_DIR})
 endif()

+dbms_target_link_libraries(PRIVATE _boost_context)
+
 if (ENABLE_TESTS AND USE_GTEST)
    macro (grep_gtest_sources BASE_DIR DST_VAR)
        # Cold match files that are not in tests/ directories
--- a/src/Client/Connection.cpp
+++ b/src/Client/Connection.cpp
@ -742,8 +742,11 @@ std::optional<UInt64> Connection::checkPacket(size_t timeout_microseconds)
 }


-Packet Connection::receivePacket()
+Packet Connection::receivePacket(std::function<void(Poco::Net::Socket &)> async_callback)
 {
+    in->setAsyncCallback(std::move(async_callback));
+    SCOPE_EXIT(in->setAsyncCallback({}));
+
    try
    {
        Packet res;
--- a/src/Client/Connection.h
+++ b/src/Client/Connection.h
@ -18,6 +18,7 @@
 #include <DataStreams/BlockStreamProfileInfo.h>

 #include <IO/ConnectionTimeouts.h>
+#include <IO/ReadBufferFromPocoSocket.h>

 #include <Interpreters/TablesStatus.h>

@ -171,7 +172,8 @@ public:
    std::optional<UInt64> checkPacket(size_t timeout_microseconds = 0);

    /// Receive packet from server.
-    Packet receivePacket();
+    /// Each time read blocks and async_callback is set, it will be called. You can poll socket inside it.
+    Packet receivePacket(std::function<void(Poco::Net::Socket &)> async_callback = {});

    /// If not connected yet, or if connection is broken - then connect. If cannot connect - throw an exception.
    void forceConnected(const ConnectionTimeouts & timeouts);
@ -226,7 +228,7 @@ private:
    String server_display_name;

    std::unique_ptr<Poco::Net::StreamSocket> socket;
-    std::shared_ptr<ReadBuffer> in;
+    std::shared_ptr<ReadBufferFromPocoSocket> in;
    std::shared_ptr<WriteBuffer> out;
    std::optional<UInt64> last_input_packet_type;

--- a/src/Client/MultiplexedConnections.cpp
+++ b/src/Client/MultiplexedConnections.cpp
@ -237,7 +237,7 @@ std::string MultiplexedConnections::dumpAddressesUnlocked() const
    return buf.str();
 }

-Packet MultiplexedConnections::receivePacketUnlocked()
+Packet MultiplexedConnections::receivePacketUnlocked(std::function<void(Poco::Net::Socket &)> async_callback)
 {
    if (!sent_query)
        throw Exception("Cannot receive packets: no query sent.", ErrorCodes::LOGICAL_ERROR);
@ -249,7 +249,7 @@ Packet MultiplexedConnections::receivePacketUnlocked()
    if (current_connection == nullptr)
        throw Exception("Logical error: no available replica", ErrorCodes::NO_AVAILABLE_REPLICA);

-    Packet packet = current_connection->receivePacket();
+    Packet packet = current_connection->receivePacket(std::move(async_callback));

    switch (packet.type)
    {
--- a/src/Client/MultiplexedConnections.h
+++ b/src/Client/MultiplexedConnections.h
@ -69,7 +69,7 @@ public:

 private:
    /// Internal version of `receivePacket` function without locking.
-    Packet receivePacketUnlocked();
+    Packet receivePacketUnlocked(std::function<void(Poco::Net::Socket &)> async_callback = {});

    /// Internal version of `dumpAddresses` function without locking.
    std::string dumpAddressesUnlocked() const;
@ -105,6 +105,8 @@ private:
    /// A mutex for the sendCancel function to execute safely
    /// in separate thread.
    mutable std::mutex cancel_mutex;
+
+    friend class RemoteQueryExecutorReadContext;
 };

 }
--- a/src/Common/CounterInFile.h
+++ b/src/Common/CounterInFile.h
@ -87,7 +87,7 @@ public:
                {
                    /// A more understandable error message.
                    if (e.code() == DB::ErrorCodes::CANNOT_READ_ALL_DATA || e.code() == DB::ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
-                        throw DB::Exception("File " + path + " is empty. You must fill it manually with appropriate value.", e.code());
+                        throw DB::ParsingException("File " + path + " is empty. You must fill it manually with appropriate value.", e.code());
                    else
                        throw;
                }
--- a/src/Common/Exception.cpp
+++ b/src/Common/Exception.cpp
@ -450,5 +450,49 @@ ExecutionStatus ExecutionStatus::fromCurrentException(const std::string & start_
    return ExecutionStatus(getCurrentExceptionCode(), msg);
 }

+ParsingException::ParsingException()
+{
+    Exception::message(Exception::message() + "{}");  /// The trailing "{}" is the placeholder that displayText() fills with the row position.
+}
+
+ParsingException::ParsingException(const std::string & msg, int code)
+    : Exception(msg, code)
+{
+    Exception::message(Exception::message() + "{}");  /// Same placeholder as in the default constructor.
+}
+
+ParsingException::ParsingException(int code, const std::string & message)
+    : Exception(message, code)
+{
+    Exception::message(Exception::message() + "{}");  /// Qualified call: the unqualified name `message` is the parameter here.
+}
+
+
+/// Builds the text shown to the user. The stored message carries a "{}" placeholder that is
+/// replaced here by the row suffix, or by an empty string when no row number is known.
+/// The result is kept in the mutable field formatted_message_ so that this method can stay const.
+std::string ParsingException::displayText() const
+{
+    try
+    {
+        const std::string row_suffix = (line_number_ == -1)
+            ? std::string()
+            : fmt::format(": (at row {})\n", line_number_);
+        formatted_message_ = fmt::format(message(), row_suffix);
+    }
+    catch (...)
+    {
+        /// Formatting failed (for example, the message is not a valid format string).
+        /// Keep what is already in formatted_message_ and let the check below decide.
+    }
+
+    /// Nothing was formatted: fall back to the generic representation.
+    if (formatted_message_.empty())
+        return Exception::displayText();
+
+    std::string text = name();
+    text += ": ";
+    text += formatted_message_;
+    return text;
+}
+

 }
--- a/src/Common/Exception.h
+++ b/src/Common/Exception.h
@ -96,6 +96,38 @@ private:
 };


+/// Special class of exceptions, used mostly in ParallelParsingInputFormat for
+/// more convenient calculation of problem line number.
+///
+/// Every constructor appends a "{}" placeholder to the message; displayText()
+/// replaces it with the row position (or with nothing while no line number is set).
+class ParsingException : public Exception
+{
+public:
+    ParsingException();
+    ParsingException(const std::string & msg, int code);
+    ParsingException(int code, const std::string & message);
+
+    // Format message with fmt::format, like the logging functions.
+    template <typename ...Args>
+    ParsingException(int code, const std::string & fmt, Args&&... args)
+        : Exception(fmt::format(fmt, std::forward<Args>(args)...), code)
+    {
+        Exception::message(Exception::message() + "{}");
+    }
+
+
+    std::string displayText() const override;
+
+    /// Returns -1 until setLineNumber() has been called.
+    /// Const, so that it can be used on an exception caught by const reference.
+    int getLineNumber() const { return static_cast<int>(line_number_); }
+    void setLineNumber(int line_number) { line_number_ = line_number; }
+
+private:
+    ssize_t line_number_{-1};
+    /// Cache written by displayText(), which is const.
+    mutable std::string formatted_message_;
+
+    const char * name() const throw() override { return "DB::ParsingException"; }
+    const char * className() const throw() override { return "DB::ParsingException"; }
+};
+
+
 using Exceptions = std::vector<std::exception_ptr>;


--- a/src/Common/Fiber.h
+++ b/src/Common/Fiber.h
@ -0,0 +1,5 @@
+#pragma once
+#include <common/defines.h>
+#include <boost/context/fiber.hpp>
+
+using Fiber = boost::context::fiber;
--- a/src/Common/FiberStack.h
+++ b/src/Common/FiberStack.h
@ -0,0 +1,74 @@
+#pragma once
+#include <common/defines.h>
+#include <boost/context/stack_context.hpp>
+#include <Common/formatReadable.h>
+#include <Common/MemoryTracker.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/mman.h>
+
+#if defined(BOOST_USE_VALGRIND)
+#include <valgrind/valgrind.h>
+#endif
+
+namespace DB::ErrorCodes
+{
+    extern const int CANNOT_ALLOCATE_MEMORY;
+}
+
+/// This is an implementation of allocator for fiber stack.
+/// The reference implementation is protected_fixedsize_stack from boost::context.
+/// This implementation additionally tracks memory usage. It is the main reason why it is needed.
+///
+/// One allocation is a single anonymous mapping: the first page is made inaccessible (guard page),
+/// the remaining pages are the usable stack. The stack pointer handed to boost::context is the end
+/// of the mapping.
+class FiberStack
+{
+private:
+    size_t stack_size;
+    size_t page_size = 0;
+public:
+    static constexpr size_t default_stack_size = 128 * 1024; /// 64KB was not enough for tests
+
+    explicit FiberStack(size_t stack_size_ = default_stack_size) : stack_size(stack_size_)
+    {
+        page_size = ::sysconf(_SC_PAGESIZE);
+    }
+
+    /// Maps a new stack and registers its usable part in the memory tracker.
+    /// Throws (via throwFromErrno) if the mapping or the guard page cannot be set up.
+    boost::context::stack_context allocate()
+    {
+        /// Round the requested size up to whole pages. A zero request still gets one page:
+        /// the plain `stack_size - 1` would wrap around for zero.
+        size_t num_pages = stack_size == 0 ? 1 : 1 + (stack_size - 1) / page_size;
+        size_t num_bytes = (num_pages + 1) * page_size; /// Add one page at bottom that will be used as guard-page
+
+        void * vp = ::mmap(nullptr, num_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+        if (MAP_FAILED == vp)
+            DB::throwFromErrno(fmt::format("FiberStack: Cannot mmap {}.", ReadableSize(num_bytes)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
+
+        if (-1 == ::mprotect(vp, page_size, PROT_NONE))
+        {
+            ::munmap(vp, num_bytes);
+            DB::throwFromErrno("FiberStack: cannot protect guard page", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
+        }
+
+        /// Do not count guard page in memory usage.
+        /// NOTE(review): the tracker presumably throws when a memory limit is exceeded - confirm.
+        /// If it does throw, the mapping must be released here, nobody else knows about it yet.
+        try
+        {
+            CurrentMemoryTracker::alloc(num_pages * page_size);
+        }
+        catch (...)
+        {
+            ::munmap(vp, num_bytes);
+            throw;
+        }
+
+        boost::context::stack_context sctx;
+        sctx.size = num_bytes;
+        sctx.sp = static_cast< char * >(vp) + sctx.size;
+#if defined(BOOST_USE_VALGRIND)
+        sctx.valgrind_stack_id = VALGRIND_STACK_REGISTER(sctx.sp, vp);
+#endif
+        return sctx;
+    }
+
+    /// Unmaps a stack produced by allocate() and removes it from the memory tracker.
+    void deallocate(boost::context::stack_context & sctx)
+    {
+#if defined(BOOST_USE_VALGRIND)
+        VALGRIND_STACK_DEREGISTER(sctx.valgrind_stack_id);
+#endif
+        /// sctx.sp points to the end of the mapping, step back to its beginning.
+        void * vp = static_cast< char * >(sctx.sp) - sctx.size;
+        ::munmap(vp, sctx.size);
+
+        /// Do not count guard page in memory usage.
+        CurrentMemoryTracker::free(sctx.size - page_size);
+    }
+};
--- a/src/Common/SimpleIncrement.h
+++ b/src/Common/SimpleIncrement.h
@ -8,9 +8,7 @@
  */
 struct SimpleIncrement
 {
-    std::atomic<UInt64> value;
-
-    SimpleIncrement(UInt64 start = 0) : value(start) {}
+    std::atomic<UInt64> value{0};

    void set(UInt64 new_value)
    {
--- a/src/Common/ThreadFuzzer.cpp
+++ b/src/Common/ThreadFuzzer.cpp
@ -10,6 +10,7 @@
 #include <common/sleep.h>

 #include <IO/ReadHelpers.h>
+#include <common/logger_useful.h>

 #include <Common/Exception.h>
 #include <Common/thread_local_rng.h>
--- a/src/Common/ThreadPool.cpp
+++ b/src/Common/ThreadPool.cpp
@ -55,6 +55,13 @@ void ThreadPoolImpl<Thread>::setMaxThreads(size_t value)
    max_threads = value;
 }

+template <typename Thread>
+size_t ThreadPoolImpl<Thread>::getMaxThreads() const  /// Returns the current value of max_threads.
+{
+    std::lock_guard lock(mutex);  /// The value is read while holding the pool mutex.
+    return max_threads;
+}
+
 template <typename Thread>
 void ThreadPoolImpl<Thread>::setMaxFreeThreads(size_t value)
 {
--- a/src/Common/ThreadPool.h
+++ b/src/Common/ThreadPool.h
@ -71,6 +71,7 @@ public:
    void setMaxThreads(size_t value);
    void setMaxFreeThreads(size_t value);
    void setQueueSize(size_t value);
+    size_t getMaxThreads() const;

 private:
    mutable std::mutex mutex;
--- a/src/Common/TimerDescriptor.cpp
+++ b/src/Common/TimerDescriptor.cpp
@ -0,0 +1,84 @@
+#if defined(OS_LINUX)
+#include <Common/TimerDescriptor.h>
+#include <Common/Exception.h>
+
+#include <sys/timerfd.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_CREATE_TIMER;
+    extern const int CANNOT_SET_TIMER_PERIOD;
+    extern const int CANNOT_FCNTL;
+    extern const int CANNOT_READ_FROM_SOCKET;
+}
+
+/// Creates a timerfd and switches it to non-blocking mode.
+/// `clockid` and `flags` are passed to timerfd_create() unchanged.
+/// Throws if the descriptor cannot be created or configured.
+TimerDescriptor::TimerDescriptor(int clockid, int flags)
+{
+    timer_fd = timerfd_create(clockid, flags);
+    if (timer_fd == -1)
+        throwFromErrno("Cannot create timer_fd descriptor", ErrorCodes::CANNOT_CREATE_TIMER);
+
+    if (-1 == fcntl(timer_fd, F_SETFL, O_NONBLOCK))
+    {
+        /// The destructor does not run when the constructor throws, so close the descriptor here.
+        /// close() may overwrite errno; keep the value that describes the fcntl failure.
+        int fcntl_errno = errno;
+        close(timer_fd);
+        timer_fd = -1;
+        errno = fcntl_errno;
+        throwFromErrno("Cannot set O_NONBLOCK for timer_fd", ErrorCodes::CANNOT_FCNTL);
+    }
+}
+
+TimerDescriptor::~TimerDescriptor()
+{
+    /// Nothing to release if the descriptor is not valid (-1).
+    /// Do not check for result cause cannot throw exception.
+    if (timer_fd != -1)
+        close(timer_fd);
+}
+
+/// Disarms the timer and discards an expiration that may already be pending.
+void TimerDescriptor::reset() const
+{
+    /// Interval and initial expiration are all zeros.
+    itimerspec zero_spec{};
+
+    const int rc = timerfd_settime(timer_fd, 0 /*relative timer */, &zero_spec, nullptr);
+    if (rc == -1)
+        throwFromErrno("Cannot reset timer_fd", ErrorCodes::CANNOT_SET_TIMER_PERIOD);
+
+    /// The alarm may have happened before the call above, in which case
+    /// the descriptor is readable. Consume that pending expiration.
+    drain();
+}
+
+/// Reads from the descriptor until it reports EAGAIN, i.e. until nothing is left to read.
+void TimerDescriptor::drain() const
+{
+    /// A readable timer descriptor is expected to return 8 bytes.
+    /// The read is repeated anyway, because a signal may interrupt it.
+    uint64_t expirations;
+    for (;;)
+    {
+        if (::read(timer_fd, &expirations, sizeof(expirations)) >= 0)
+            continue;
+
+        /// Nothing more to read.
+        if (errno == EAGAIN)
+            return;
+
+        /// EINTR means "try again", anything else is a real error.
+        if (errno != EINTR)
+            throwFromErrno("Cannot drain timer_fd", ErrorCodes::CANNOT_READ_FROM_SOCKET);
+    }
+}
+
+/// Arms the timer to expire once, `timespan` from now (no periodic interval).
+/// NOTE(review): a zero timespan yields an all-zero it_value, which timerfd_settime treats as
+/// "disarm" rather than "fire immediately" - confirm that callers never pass zero.
+void TimerDescriptor::setRelative(const Poco::Timespan & timespan) const
+{
+    itimerspec spec;
+    spec.it_interval.tv_nsec = 0;
+    spec.it_interval.tv_sec = 0;
+    spec.it_value.tv_sec = timespan.totalSeconds();
+    /// useconds() is the sub-second part in microseconds, while tv_nsec is in nanoseconds.
+    spec.it_value.tv_nsec = timespan.useconds() * 1000;
+
+    if (-1 == timerfd_settime(timer_fd, 0 /*relative timer */, &spec, nullptr))
+        throwFromErrno("Cannot set time for timer_fd", ErrorCodes::CANNOT_SET_TIMER_PERIOD);
+}
+
+}
+#endif
--- a/src/Common/TimerDescriptor.h
+++ b/src/Common/TimerDescriptor.h
@ -0,0 +1,31 @@
+#pragma once
+#if defined(OS_LINUX)
+#include <Poco/Timespan.h>
+
+namespace DB
+{
+
+/// Wrapper over timerfd.
+/// Owns the descriptor: it is closed in the destructor, so at most one object may hold a given descriptor.
+class TimerDescriptor
+{
+private:
+    int timer_fd;
+
+public:
+    explicit TimerDescriptor(int clockid, int flags);
+    ~TimerDescriptor();
+
+    TimerDescriptor(const TimerDescriptor &) = delete;
+    TimerDescriptor & operator=(const TimerDescriptor &) = delete;
+
+    /// Moving transfers ownership. The source is left with -1, so that its destructor
+    /// does not close the descriptor that now belongs to the new object.
+    TimerDescriptor(TimerDescriptor && other) noexcept : timer_fd(other.timer_fd) { other.timer_fd = -1; }
+
+    /// Exchanges the descriptors: the one previously held by *this is closed when `other` is destroyed.
+    TimerDescriptor & operator=(TimerDescriptor && other) noexcept
+    {
+        int previous_fd = timer_fd;
+        timer_fd = other.timer_fd;
+        other.timer_fd = previous_fd;
+        return *this;
+    }
+
+    int getDescriptor() const { return timer_fd; }
+
+    /// Disarm the timer and discard a pending expiration, if any.
+    void reset() const;
+    /// Read out everything that is currently readable from the descriptor.
+    void drain() const;
+    /// Arm the timer to expire once after `timespan`.
+    void setRelative(const Poco::Timespan & timespan) const;
+};
+
+}
+#endif
--- a/src/Common/XDBCBridgeHelper.h
+++ b/src/Common/XDBCBridgeHelper.h
@ -76,7 +76,7 @@ public:
    const Context & context;
    const Configuration & config;

-    static constexpr inline auto DEFAULT_HOST = "localhost";
+    static constexpr inline auto DEFAULT_HOST = "127.0.0.1";
    static constexpr inline auto DEFAULT_PORT = BridgeHelperMixin::DEFAULT_PORT;
    static constexpr inline auto PING_HANDLER = "/ping";
    static constexpr inline auto MAIN_HANDLER = "/";
--- a/src/Common/ZooKeeper/tests/CMakeLists.txt
+++ b/src/Common/ZooKeeper/tests/CMakeLists.txt
@ -4,9 +4,6 @@ target_link_libraries(zkutil_test_commands PRIVATE clickhouse_common_zookeeper)
 add_executable(zkutil_test_commands_new_lib zkutil_test_commands_new_lib.cpp)
 target_link_libraries(zkutil_test_commands_new_lib PRIVATE clickhouse_common_zookeeper string_utils)

-add_executable(zkutil_expiration_test zkutil_expiration_test.cpp)
-target_link_libraries(zkutil_expiration_test PRIVATE clickhouse_common_zookeeper)
-
 add_executable(zkutil_test_async zkutil_test_async.cpp)
 target_link_libraries(zkutil_test_async PRIVATE clickhouse_common_zookeeper)

--- a/src/Common/ZooKeeper/tests/nozk.sh
+++ b/src/Common/ZooKeeper/tests/nozk.sh
@ -1,15 +0,0 @@
-#!/usr/bin/env bash
-
-# Добавляет в файрвол правила, не пропускающие пакеты до серверов ZooKeeper.
-# Используется для тестирования поведения программ при потере соединения с ZooKeeper.
-# yeszk.sh производит обратные изменения.
-
-# Чтобы посмотреть, какие правила сейчас есть, используйте sudo iptables -L и sudo ip6tables -L
-
-sudo iptables -A OUTPUT -p tcp --dport 2181 -j DROP
-sudo ip6tables -A OUTPUT -p tcp --dport 2181 -j DROP
-
-# You could also test random drops:
-#sudo iptables -A OUTPUT -p tcp --dport 2181 -j REJECT --reject-with tcp-reset -m statistic --mode random --probability 0.1
-#sudo ip6tables -A OUTPUT -p tcp --dport 2181 -j REJECT --reject-with tcp-reset -m statistic --mode random --probability 0.1
-
--- a/src/Common/ZooKeeper/tests/yeszk.sh
+++ b/src/Common/ZooKeeper/tests/yeszk.sh
@ -1,6 +0,0 @@
-#!/usr/bin/env bash
-
-# Выполняет действия, обратные nozk.sh
-
-cat nozk.sh | sed 's/-A/-D/g' | bash
-
--- a/src/Common/ZooKeeper/tests/zkutil_expiration_test.cpp
+++ b/src/Common/ZooKeeper/tests/zkutil_expiration_test.cpp
@ -1,70 +0,0 @@
-#include <iostream>
-#include <Common/ZooKeeper/ZooKeeper.h>
-#include <Common/ZooKeeper/KeeperException.h>
-#include <Poco/ConsoleChannel.h>
-#include <Common/Exception.h>
-
-
-/// Проверяет, какие ошибки выдает ZooKeeper при попытке сделать какую-нибудь операцию через разное время после истечения сессии.
-/// Спойлер: multi иногда падает с segfault, а до этого фейлится с marshalling error.
-///          create всегда фейлится с invalid zhandle state.
-
-int main(int argc, char ** argv)
-{
-    try
-    {
-        if (argc != 2)
-        {
-            std::cerr << "usage: " << argv[0] << " hosts" << std::endl;
-            return 2;
-        }
-
-        Poco::AutoPtr<Poco::ConsoleChannel> channel = new Poco::ConsoleChannel(std::cerr);
-        Poco::Logger::root().setChannel(channel);
-        Poco::Logger::root().setLevel("trace");
-
-        zkutil::ZooKeeper zk(argv[1]);
-        std::string unused;
-        zk.tryCreate("/test", "", zkutil::CreateMode::Persistent, unused);
-
-        std::cerr << "Please run `./nozk.sh && sleep 40s && ./yeszk.sh`" << std::endl;
-
-        time_t time0 = time(nullptr);
-
-        while (true)
-        {
-            {
-                Coordination::Requests ops;
-                ops.emplace_back(zkutil::makeCreateRequest("/test/zk_expiration_test", "hello", zkutil::CreateMode::Persistent));
-                ops.emplace_back(zkutil::makeRemoveRequest("/test/zk_expiration_test", -1));
-
-                Coordination::Responses responses;
-                Coordination::Error code = zk.tryMultiNoThrow(ops, responses);
-
-                std::cout << time(nullptr) - time0 << "s: " << Coordination::errorMessage(code) << std::endl;
-                try
-                {
-                    if (code != Coordination::Error::ZOK)
-                        std::cout << "Path: " << zkutil::KeeperMultiException(code, ops, responses).getPathForFirstFailedOp() << std::endl;
-                }
-                catch (...)
-                {
-                    std::cout << DB::getCurrentExceptionMessage(false) << std::endl;
-                }
-
-            }
-
-            sleep(1);
-        }
-    }
-    catch (Coordination::Exception &)
-    {
-        std::cerr << "KeeperException: " << DB::getCurrentExceptionMessage(true) << std::endl;
-        return 1;
-    }
-    catch (...)
-    {
-        std::cerr << "Some exception: " << DB::getCurrentExceptionMessage(true) << std::endl;
-        return 2;
-    }
-}
--- a/src/Common/ya.make
+++ b/src/Common/ya.make
@ -75,6 +75,7 @@ SRCS(
    ThreadPool.cpp
    ThreadProfileEvents.cpp
    ThreadStatus.cpp
+    TimerDescriptor.cpp
    TraceCollector.cpp
    UTF8Helpers.cpp
    UnicodeBar.cpp
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -405,16 +405,21 @@ class IColumn;
    M(Bool, optimize_skip_merged_partitions, false, "Skip partitions with one part with level > 0 in optimize final", 0) \
    M(Bool, optimize_on_insert, true, "Do the same transformation for inserted block of data as if merge was done on this block.", 0) \
    M(Bool, allow_experimental_map_type, false, "Allow data type Map", 0) \
+    M(Bool, allow_experimental_window_functions, false, "Allow experimental window functions", 0) \
    \
    M(Bool, use_antlr_parser, false, "Parse incoming queries using ANTLR-generated parser", 0) \
    \
+    M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \
+    \
    /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \
    \
    M(UInt64, max_memory_usage_for_all_queries, 0, "Obsolete. Will be removed after 2020-10-20", 0) \
    M(UInt64, multiple_joins_rewriter_version, 0, "Obsolete setting, does nothing. Will be removed after 2021-03-31", 0) \
    M(Bool, enable_debug_queries, false, "Enabled debug queries, but now is obsolete", 0) \
    M(Bool, allow_experimental_database_atomic, true, "Obsolete setting, does nothing. Will be removed after 2021-02-12", 0) \
-    M(UnionMode, union_default_mode, UnionMode::DISTINCT, "Set default Union Mode in SelectWithUnion query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without Union Mode will throw exception.", 0)
+    M(UnionMode, union_default_mode, UnionMode::DISTINCT, "Set default Union Mode in SelectWithUnion query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without Union Mode will throw exception.", 0) \
+    M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \
+    M(Bool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \

 // End of COMMON_SETTINGS
 // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS below.
@ -426,10 +431,10 @@ class IColumn;
    M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \
    M(Bool, input_format_csv_unquoted_null_literal_as_null, false, "Consider unquoted NULL literal as \\N", 0) \
    M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices \\N", 0) \
+    M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". Braces around array can be omitted.)", 0) \
    M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).", 0) \
    M(Bool, input_format_with_names_use_header, true, "For TSVWithNames and CSVWithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.", 0) \
    M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \
-    M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \
    M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, CSV and TSV formats).", IMPORTANT) \
    M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \
    M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \
@ -438,7 +443,6 @@ class IColumn;
    M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \
    M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \
    \
-    M(Bool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \
    M(Bool, input_format_values_interpret_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression.", 0) \
    M(Bool, input_format_values_deduce_templates_of_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser, deduce template of the SQL expression, try to parse all rows using template and then interpret expression for all rows.", 0) \
    M(Bool, input_format_values_accurate_types_of_literals, true, "For Values format: when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues.", 0) \
@ -486,7 +490,9 @@ class IColumn;
    \
    M(Bool, output_format_enable_streaming, false, "Enable streaming in output formats that support it.", 0) \
    M(Bool, output_format_write_statistics, true, "Write statistics about read rows, bytes, time elapsed in suitable output formats.", 0) \
-    M(Bool, output_format_pretty_row_numbers, false, "Add row numbers before each row for pretty output format", 0)
+    M(Bool, output_format_pretty_row_numbers, false, "Add row numbers before each row for pretty output format", 0) \
+    M(Bool, insert_distributed_one_random_shard, false, "If setting is enabled, inserting into distributed table will choose a random shard to write when there is no sharding key", 0) \
+

 // End of FORMAT_FACTORY_SETTINGS
 // Please add settings non-related to formats into the COMMON_SETTINGS above.
--- a/src/Core/SortDescription.cpp
+++ b/src/Core/SortDescription.cpp
@ -37,5 +37,12 @@ void dumpSortDescription(const SortDescription & description, const Block & head
    }
 }

+/// Convenience overload: renders the description into a string, passing an empty header Block.
+std::string dumpSortDescription(const SortDescription & description)
+{
+    const Block empty_header;
+    WriteBufferFromOwnString out;
+    dumpSortDescription(description, empty_header, out);
+    return out.str();
+}
+
 }

--- a/src/Core/SortDescription.h
+++ b/src/Core/SortDescription.h
@ -72,4 +72,6 @@ class Block;
 /// Outputs user-readable description into `out`.
 void dumpSortDescription(const SortDescription & description, const Block & header, WriteBuffer & out);

+std::string dumpSortDescription(const SortDescription & description);
+
 }
--- a/src/DataStreams/NativeBlockInputStream.cpp
+++ b/src/DataStreams/NativeBlockInputStream.cpp
@ -106,7 +106,7 @@ Block NativeBlockInputStream::readImpl()
    if (istr.eof())
    {
        if (use_index)
-            throw Exception("Input doesn't contain all data for index.", ErrorCodes::CANNOT_READ_ALL_DATA);
+            throw ParsingException("Input doesn't contain all data for index.", ErrorCodes::CANNOT_READ_ALL_DATA);

        return res;
    }
--- a/src/DataStreams/ParallelParsingBlockInputStream.cpp
+++ b/src/DataStreams/ParallelParsingBlockInputStream.cpp
@ -126,8 +126,11 @@ void ParallelParsingBlockInputStream::segmentatorThreadFunction(ThreadGroupStatu
            // Segmentating the original input.
            unit.segment.resize(0);

-            const bool have_more_data = file_segmentation_engine(original_buffer,
-                unit.segment, min_chunk_bytes);
+            auto [have_more_data, currently_read_rows] = file_segmentation_engine(
+                original_buffer, unit.segment, min_chunk_bytes);
+
+            unit.offset = successfully_read_rows_count;
+            successfully_read_rows_count += currently_read_rows;

            unit.is_last = !have_more_data;
            unit.status = READY_TO_PARSE;
@ -142,7 +145,7 @@ void ParallelParsingBlockInputStream::segmentatorThreadFunction(ThreadGroupStatu
    }
    catch (...)
    {
-        onBackgroundException();
+        onBackgroundException(successfully_read_rows_count);
    }
 }

@ -157,11 +160,11 @@ void ParallelParsingBlockInputStream::parserThreadFunction(ThreadGroupStatusPtr

    setThreadName("ChunkParser");

+    const auto current_unit_number = current_ticket_number % processing_units.size();
+    auto & unit = processing_units[current_unit_number];
+
    try
    {
-        const auto current_unit_number = current_ticket_number % processing_units.size();
-        auto & unit = processing_units[current_unit_number];
-
        /*
         * This is kind of suspicious -- the input_process_creator contract with
         * respect to multithreaded use is not clear, but we hope that it is
@ -195,19 +198,22 @@ void ParallelParsingBlockInputStream::parserThreadFunction(ThreadGroupStatusPtr
    }
    catch (...)
    {
-        onBackgroundException();
+        onBackgroundException(unit.offset);
    }
 }

-void ParallelParsingBlockInputStream::onBackgroundException()
+void ParallelParsingBlockInputStream::onBackgroundException(size_t offset)
 {
-    tryLogCurrentException(__PRETTY_FUNCTION__);
-
    std::unique_lock<std::mutex> lock(mutex);
    if (!background_exception)
    {
        background_exception = std::current_exception();
+
+        if (ParsingException * e = exception_cast<ParsingException *>(background_exception))
+            if (e->getLineNumber() != -1)
+                e->setLineNumber(e->getLineNumber() + offset);
    }
+    tryLogCurrentException(__PRETTY_FUNCTION__);
    finished = true;
    reader_condvar.notify_all();
    segmentator_condvar.notify_all();
--- a/src/DataStreams/ParallelParsingBlockInputStream.h
+++ b/src/DataStreams/ParallelParsingBlockInputStream.h
@ -149,6 +149,8 @@ private:
        BlockExt block_ext;
        Memory<> segment;
        std::atomic<ProcessingUnitStatus> status;
+        /// Needed for better exception message.
+        size_t offset = 0;
        bool is_last{false};
    };

@ -159,6 +161,10 @@ private:
    std::deque<ProcessingUnit> processing_units;


+    /// Compute it to have a more understandable error message.
+    size_t successfully_read_rows_count{0};
+
+
    void scheduleParserThreadForUnitWithNumber(size_t ticket_number);
    void finishAndWait();

@ -169,7 +175,7 @@ private:
    // threads. This function is used by segmentator and parsed threads.
    // readImpl() is called from the main thread, so the exception handling
    // is different.
-    void onBackgroundException();
+    void onBackgroundException(size_t offset);
 };

 }
--- a/src/DataStreams/RemoteQueryExecutor.cpp
+++ b/src/DataStreams/RemoteQueryExecutor.cpp
@ -1,4 +1,5 @@
 #include <DataStreams/RemoteQueryExecutor.h>
+#include <DataStreams/RemoteQueryExecutorReadContext.h>

 #include <Columns/ColumnConst.h>
 #include <Common/CurrentThread.h>
@ -11,6 +12,7 @@
 #include <Interpreters/Context.h>
 #include <Interpreters/InternalTextLogsQueue.h>
 #include <IO/ConnectionTimeoutsContext.h>
+#include <Common/FiberStack.h>

 namespace DB
 {
@ -192,68 +194,119 @@ Block RemoteQueryExecutor::read()

        Packet packet = multiplexed_connections->receivePacket();

-        switch (packet.type)
-        {
-            case Protocol::Server::Data:
-                /// If the block is not empty and is not a header block
-                if (packet.block && (packet.block.rows() > 0))
-                    return adaptBlockStructure(packet.block, header);
-                break;  /// If the block is empty - we will receive other packets before EndOfStream.
-
-            case Protocol::Server::Exception:
-                got_exception_from_replica = true;
-                packet.exception->rethrow();
-                break;
-
-            case Protocol::Server::EndOfStream:
-                if (!multiplexed_connections->hasActiveConnections())
-                {
-                    finished = true;
-                    return Block();
-                }
-                break;
-
-            case Protocol::Server::Progress:
-                /** We use the progress from a remote server.
-                  * We also include in ProcessList,
-                  * and we use it to check
-                  * constraints (for example, the minimum speed of query execution)
-                  * and quotas (for example, the number of lines to read).
-                  */
-                if (progress_callback)
-                    progress_callback(packet.progress);
-                break;
-
-            case Protocol::Server::ProfileInfo:
-                /// Use own (client-side) info about read bytes, it is more correct info than server-side one.
-                if (profile_info_callback)
-                    profile_info_callback(packet.profile_info);
-                break;
-
-            case Protocol::Server::Totals:
-                totals = packet.block;
-                break;
-
-            case Protocol::Server::Extremes:
-                extremes = packet.block;
-                break;
-
-            case Protocol::Server::Log:
-                /// Pass logs from remote server to client
-                if (auto log_queue = CurrentThread::getInternalTextLogsQueue())
-                    log_queue->pushBlock(std::move(packet.block));
-                break;
-
-            default:
-                got_unknown_packet_from_replica = true;
-                throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}",
-                    toString(packet.type),
-                    multiplexed_connections->dumpAddresses());
-        }
+        if (auto block = processPacket(std::move(packet)))
+            return *block;
    }
 }

-void RemoteQueryExecutor::finish()
+/// Asynchronous read. Returns either a block (an empty one when there is nothing more to read
+/// or the query was cancelled) or the epoll descriptor of the read context to poll on
+/// while the read is still in progress. On non-Linux builds falls back to the blocking read().
+std::variant<Block, int> RemoteQueryExecutor::read(std::unique_ptr<ReadContext> & read_context [[maybe_unused]])
+{
+
+#if defined(OS_LINUX)
+    if (!sent_query)
+    {
+        sendQuery();
+
+        const bool no_connections = (0 == multiplexed_connections->size());
+        if (context.getSettingsRef().skip_unavailable_shards && no_connections)
+            return Block();
+    }
+
+    /// Lazily create the read state on the first call, unless the query was already cancelled.
+    if (!read_context)
+    {
+        std::lock_guard lock(was_cancelled_mutex);
+        if (was_cancelled)
+            return Block();
+
+        read_context = std::make_unique<ReadContext>(*multiplexed_connections);
+    }
+
+    while (true)
+    {
+        if (!read_context->resumeRoutine())
+            return Block();
+
+        /// The routine was suspended in the middle of a read: arm the timeout and let the caller poll.
+        if (read_context->is_read_in_progress)
+        {
+            read_context->setTimer();
+            return read_context->epoll_fd;
+        }
+
+        /// A whole packet was received; it may or may not produce a block for the caller.
+        auto data = processPacket(std::move(read_context->packet));
+        if (data)
+            return std::move(*data);
+    }
+#else
+    return read();
+#endif
+}
+
+/// Handles one packet received from a replica.
+/// Returns a block when the packet yields something for the caller of read()
+/// (a non-empty data block, or an empty block once no connection is active any more);
+/// returns an empty optional when the caller should keep receiving packets.
+/// Rethrows the exception carried by an Exception packet and throws on an unknown packet type.
+std::optional<Block> RemoteQueryExecutor::processPacket(Packet packet)
+{
+    switch (packet.type)
+    {
+        case Protocol::Server::Data:
+        {
+            /// Only a non-empty, non-header block is handed to the caller.
+            /// After an empty block other packets will follow before EndOfStream.
+            const bool has_rows = packet.block && (packet.block.rows() > 0);
+            if (has_rows)
+                return adaptBlockStructure(packet.block, header);
+            return std::nullopt;
+        }
+
+        case Protocol::Server::Exception:
+            got_exception_from_replica = true;
+            packet.exception->rethrow();
+            return std::nullopt;
+
+        case Protocol::Server::EndOfStream:
+            if (multiplexed_connections->hasActiveConnections())
+                return std::nullopt;
+
+            finished = true;
+            return Block();
+
+        case Protocol::Server::Progress:
+            /** Progress is taken from the remote server.
+              * It is also accounted in ProcessList and used to check
+              * constraints (for example, the minimum speed of query execution)
+              * and quotas (for example, the number of lines to read).
+              */
+            if (progress_callback)
+                progress_callback(packet.progress);
+            return std::nullopt;
+
+        case Protocol::Server::ProfileInfo:
+            /// Use own (client-side) info about read bytes, it is more correct info than server-side one.
+            if (profile_info_callback)
+                profile_info_callback(packet.profile_info);
+            return std::nullopt;
+
+        case Protocol::Server::Totals:
+            totals = packet.block;
+            return std::nullopt;
+
+        case Protocol::Server::Extremes:
+            extremes = packet.block;
+            return std::nullopt;
+
+        case Protocol::Server::Log:
+            /// Forward logs of the remote server to the client.
+            if (auto log_queue = CurrentThread::getInternalTextLogsQueue())
+                log_queue->pushBlock(std::move(packet.block));
+            return std::nullopt;
+
+        default:
+            got_unknown_packet_from_replica = true;
+            throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}",
+                toString(packet.type),
+                multiplexed_connections->dumpAddresses());
+    }
+
+    return std::nullopt;
+}
+
+void RemoteQueryExecutor::finish(std::unique_ptr<ReadContext> * read_context)
 {
    /** If one of:
      * - nothing started to do;
@ -270,7 +323,7 @@ void RemoteQueryExecutor::finish()
      */

    /// Send the request to abort the execution of the request, if not already sent.
-    tryCancel("Cancelling query because enough data has been read");
+    tryCancel("Cancelling query because enough data has been read", read_context);

    /// Get the remaining packets so that there is no out of sync in the connections to the replicas.
    Packet packet = multiplexed_connections->drain();
@ -299,7 +352,7 @@ void RemoteQueryExecutor::finish()
    }
 }

-void RemoteQueryExecutor::cancel()
+void RemoteQueryExecutor::cancel(std::unique_ptr<ReadContext> * read_context)
 {
    {
        std::lock_guard lock(external_tables_mutex);
@ -313,7 +366,7 @@ void RemoteQueryExecutor::cancel()
    if (!isQueryPending() || hasThrownException())
        return;

-    tryCancel("Cancelling query");
+    tryCancel("Cancelling query", read_context);
 }

 void RemoteQueryExecutor::sendScalars()
@ -365,7 +418,7 @@ void RemoteQueryExecutor::sendExternalTables()
    multiplexed_connections->sendExternalTablesData(external_tables_data);
 }

-void RemoteQueryExecutor::tryCancel(const char * reason)
+void RemoteQueryExecutor::tryCancel(const char * reason, std::unique_ptr<ReadContext> * read_context)
 {
    {
        /// Flag was_cancelled is atomic because it is checked in read().
@ -375,6 +428,10 @@ void RemoteQueryExecutor::tryCancel(const char * reason)
            return;

        was_cancelled = true;
+
+        if (read_context && *read_context)
+            (*read_context)->cancel();
+
        multiplexed_connections->sendCancel();
    }

--- a/src/DataStreams/RemoteQueryExecutor.h
+++ b/src/DataStreams/RemoteQueryExecutor.h
@ -5,6 +5,9 @@
 #include <Storages/IStorage_fwd.h>
 #include <Interpreters/Context.h>
 #include <Interpreters/StorageID.h>
+#include <Common/FiberStack.h>
+#include <Common/TimerDescriptor.h>
+#include <variant>

 namespace DB
 {
@ -20,10 +23,14 @@ using ProgressCallback = std::function<void(const Progress & progress)>;
 struct BlockStreamProfileInfo;
 using ProfileInfoCallback = std::function<void(const BlockStreamProfileInfo & info)>;

+class RemoteQueryExecutorReadContext;
+
 /// This class allows one to launch queries on remote replicas of one shard and get results
 class RemoteQueryExecutor
 {
 public:
+    using ReadContext = RemoteQueryExecutorReadContext;
+
    /// Takes already set connection.
    RemoteQueryExecutor(
        Connection & connection,
@ -53,13 +60,17 @@ public:
    /// Read next block of data. Returns empty block if query is finished.
    Block read();

+    /// Async variant of read. Returns ready block or file descriptor which may be used for polling.
+    /// ReadContext is an internal read state. Pass empty ptr first time, reuse created one for every call.
+    std::variant<Block, int> read(std::unique_ptr<ReadContext> & read_context);
+
    /// Receive all remain packets and finish query.
    /// It should be cancelled after read returned empty block.
-    void finish();
+    void finish(std::unique_ptr<ReadContext> * read_context = nullptr);

    /// Cancel query execution. Sends Cancel packet and ignore others.
    /// This method may be called from separate thread.
-    void cancel();
+    void cancel(std::unique_ptr<ReadContext> * read_context = nullptr);

    /// Get totals and extremes if any.
    Block getTotals() { return std::move(totals); }
@ -153,13 +164,16 @@ private:
    void sendExternalTables();

    /// If wasn't sent yet, send request to cancel all connections to replicas
-    void tryCancel(const char * reason);
+    void tryCancel(const char * reason, std::unique_ptr<ReadContext> * read_context);

    /// Returns true if query was sent
    bool isQueryPending() const;

    /// Returns true if exception was thrown
    bool hasThrownException() const;
+
+    /// Process packet for read and return data block if possible.
+    std::optional<Block> processPacket(Packet packet);
 };

 }
--- a/src/DataStreams/RemoteQueryExecutorReadContext.h
+++ b/src/DataStreams/RemoteQueryExecutorReadContext.h
@ -0,0 +1,272 @@
+#pragma once
+
+#if defined(OS_LINUX)
+
+#include <sys/epoll.h>
+#include <Common/Fiber.h>
+#include <Common/FiberStack.h>
+#include <Common/TimerDescriptor.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_READ_FROM_SOCKET;
+    extern const int CANNOT_OPEN_FILE;
+    extern const int SOCKET_TIMEOUT;
+}
+
+/// State of an asynchronous read from MultiplexedConnections.
+/// Packet receiving (receivePacketUnlocked) runs inside a fiber; each time the ReadCallback
+/// is invoked for a socket, the fiber switches back to the caller, who can poll `epoll_fd`
+/// and call resumeRoutine() again later.
+/// The object owns epoll_fd and both ends of pipe_fd and closes them on destruction.
+class RemoteQueryExecutorReadContext
+{
+public:
+    using Self = RemoteQueryExecutorReadContext;
+
+    /// True while the fiber is suspended inside ReadCallback, i.e. no complete packet is available yet.
+    bool is_read_in_progress = false;
+    /// The last packet received by the routine.
+    Packet packet;
+
+    /// Exception caught inside the fiber; rethrown to the caller by resumeRoutine().
+    std::exception_ptr exception;
+    FiberStack stack;
+    boost::context::fiber fiber;
+    /// This mutex for fiber is needed because fiber could be destroyed in cancel method from another thread.
+    std::mutex fiber_lock;
+
+    Poco::Timespan receive_timeout;
+    MultiplexedConnections & connections;
+    /// NOTE(review): never assigned anywhere in this class, so the address is never
+    /// appended to the message in checkTimeout() unless it is set from outside.
+    Poco::Net::Socket * last_used_socket = nullptr;
+
+    /// Here we have three descriptors we are going to wait:
+    /// * socket_fd is a descriptor of connection. It may be changed in case of reading from several replicas.
+    /// * timer is a timerfd descriptor to manually check socket timeout
+    /// * pipe_fd is a pipe we use to cancel query and socket polling by executor.
+    /// We put those descriptors into our own epoll_fd which is used by external executor.
+    TimerDescriptor timer{CLOCK_MONOTONIC, 0};
+    int socket_fd = -1;
+    int epoll_fd = -1;
+    int pipe_fd[2] = {-1, -1};
+
+    /// Creates the epoll descriptor and the cancellation pipe, registers the pipe and the timer
+    /// in epoll and prepares (but does not start) the receiving fiber.
+    explicit RemoteQueryExecutorReadContext(MultiplexedConnections & connections_) : connections(connections_)
+    {
+        try
+        {
+            epoll_fd = epoll_create(2);
+            if (-1 == epoll_fd)
+                throwFromErrno("Cannot create epoll descriptor", ErrorCodes::CANNOT_OPEN_FILE);
+
+            if (-1 == pipe2(pipe_fd, O_NONBLOCK))
+                throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_OPEN_FILE);
+
+            {
+                epoll_event socket_event;
+                socket_event.events = EPOLLIN | EPOLLPRI;
+                socket_event.data.fd = pipe_fd[0];
+
+                if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &socket_event))
+                    throwFromErrno("Cannot add pipe descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
+            }
+
+            {
+                epoll_event timer_event;
+                timer_event.events = EPOLLIN | EPOLLPRI;
+                timer_event.data.fd = timer.getDescriptor();
+
+                if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer_event.data.fd, &timer_event))
+                    throwFromErrno("Cannot add timer descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
+            }
+
+            auto routine = Routine{connections, *this};
+            fiber = boost::context::fiber(std::allocator_arg_t(), stack, std::move(routine));
+        }
+        catch (...)
+        {
+            /// The destructor is not called for a partially constructed object,
+            /// so descriptors opened so far have to be released here.
+            closeDescriptors();
+            throw;
+        }
+    }
+
+    /// Makes `socket` the one being polled: replaces the previously registered socket descriptor
+    /// in epoll (if it differs) and remembers the receive timeout of the new socket.
+    void setSocket(Poco::Net::Socket & socket)
+    {
+        int fd = socket.impl()->sockfd();
+        if (fd == socket_fd)
+            return;
+
+        epoll_event socket_event;
+        socket_event.events = EPOLLIN | EPOLLPRI;
+        socket_event.data.fd = fd;
+
+        if (socket_fd != -1)
+        {
+            if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_DEL, socket_fd, &socket_event))
+                throwFromErrno("Cannot remove socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
+        }
+
+        socket_fd = fd;
+
+        if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, socket_fd, &socket_event))
+            throwFromErrno("Cannot add socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
+
+        receive_timeout = socket.impl()->getReceiveTimeout();
+    }
+
+    /// Same as checkTimeoutImpl(), but adds the peer address to the exception message
+    /// when last_used_socket is set.
+    bool checkTimeout() const
+    {
+        try
+        {
+            return checkTimeoutImpl();
+        }
+        catch (DB::Exception & e)
+        {
+            if (last_used_socket)
+                e.addMessage(" while reading from socket ({})", last_used_socket->peerAddress().toString());
+            throw;
+        }
+    }
+
+    /// Checks which of the registered descriptors are ready (without blocking).
+    /// Returns false if the cancellation pipe was signalled, true otherwise.
+    /// Throws SOCKET_TIMEOUT if the timer has expired while the socket is still not readable.
+    bool checkTimeoutImpl() const
+    {
+        epoll_event events[3];
+        events[0].data.fd = events[1].data.fd = events[2].data.fd = -1;
+
+        /// Wait for epoll_fd will not block if it was polled externally.
+        int num_events = epoll_wait(epoll_fd, events, 3, 0);
+        if (num_events == -1)
+            throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET);
+
+        bool is_socket_ready = false;
+        bool is_pipe_alarmed = false;
+        bool has_timer_alarm = false;
+
+        for (int i = 0; i < num_events; ++i)
+        {
+            if (events[i].data.fd == socket_fd)
+                is_socket_ready = true;
+            if (events[i].data.fd == timer.getDescriptor())
+                has_timer_alarm = true;
+            if (events[i].data.fd == pipe_fd[0])
+                is_pipe_alarmed = true;
+        }
+
+        if (is_pipe_alarmed)
+            return false;
+
+        if (has_timer_alarm && !is_socket_ready)
+        {
+            /// Socket receive timeout. Drain the timer first: if draining fails,
+            /// its error would otherwise be hidden by the timeout exception.
+            timer.drain();
+            throw NetException("Timeout exceeded", ErrorCodes::SOCKET_TIMEOUT);
+        }
+
+        return true;
+    }
+
+    /// Re-arms the receive timeout before the caller starts polling epoll_fd.
+    void setTimer() const
+    {
+        /// Did not get packet yet. Init timeout for the next async reading.
+        timer.reset();
+
+        /// A zero receive timeout leaves the timer disarmed.
+        if (receive_timeout.totalMicroseconds())
+            timer.setRelative(receive_timeout);
+    }
+
+    /// Switches into the fiber until it either suspends in ReadCallback or receives a packet.
+    /// Returns false if the read was cancelled (pipe signalled or fiber already destroyed).
+    /// Rethrows an exception stored by the routine.
+    bool resumeRoutine()
+    {
+        if (is_read_in_progress && !checkTimeout())
+            return false;
+
+        {
+            std::lock_guard guard(fiber_lock);
+            if (!fiber)
+                return false;
+
+            fiber = std::move(fiber).resume();
+        }
+
+        if (exception)
+            std::rethrow_exception(std::move(exception));
+
+        return true;
+    }
+
+    /// Destroys the fiber and wakes up whoever polls epoll_fd by writing to the pipe.
+    void cancel()
+    {
+        std::lock_guard guard(fiber_lock);
+        /// It is safe to just destroy fiber - we are not in the process of reading from socket.
+        boost::context::fiber to_destroy = std::move(fiber);
+
+        /// Send something to pipe to cancel executor waiting.
+        uint64_t buf = 0;
+        while (-1 == write(pipe_fd[1], &buf, sizeof(buf)))
+        {
+            /// The pipe is non-blocking; EAGAIN means it is full, i.e. it is already signalled.
+            if (errno == EAGAIN)
+                break;
+
+            if (errno != EINTR)
+                throwFromErrno("Cannot write to pipe", ErrorCodes::CANNOT_READ_FROM_SOCKET);
+        }
+    }
+
+    ~RemoteQueryExecutorReadContext()
+    {
+        /// socket_fd is closed by Poco::Net::Socket
+        /// timer_fd is closed by TimerDescriptor
+        /// epoll_fd and both ends of the pipe are owned by this object.
+        closeDescriptors();
+    }
+
+    /// The function object executed inside the fiber.
+    struct Routine
+    {
+        MultiplexedConnections & connections;
+        Self & read_context;
+
+        /// Passed to receivePacketUnlocked; on each invocation registers the socket for polling
+        /// and switches back to the caller of resumeRoutine() until the fiber is resumed again.
+        struct ReadCallback
+        {
+            Self & read_context;
+            Fiber & fiber;
+
+            void operator()(Poco::Net::Socket & socket)
+            {
+                try
+                {
+                    read_context.setSocket(socket);
+                }
+                catch (DB::Exception & e)
+                {
+                    e.addMessage(" while reading from socket ({})", socket.peerAddress().toString());
+                    throw;
+                }
+
+                read_context.is_read_in_progress = true;
+                fiber = std::move(fiber).resume();
+                read_context.is_read_in_progress = false;
+            }
+        };
+
+        /// Receives packets in a loop, yielding to `sink` after every received packet.
+        /// Any exception (except the forced unwind) is stored in read_context.exception.
+        Fiber operator()(Fiber && sink) const
+        {
+            try
+            {
+                while (true)
+                {
+                    read_context.packet = connections.receivePacketUnlocked(ReadCallback{read_context, sink});
+                    sink = std::move(sink).resume();
+                }
+            }
+            catch (const boost::context::detail::forced_unwind &)
+            {
+                /// This exception is thrown by fiber implementation in case if fiber is being deleted but hasn't exited
+                /// It should not be caught or it will segfault.
+                /// Other exceptions must be caught
+                throw;
+            }
+            catch (...)
+            {
+                read_context.exception = std::current_exception();
+            }
+
+            return std::move(sink);
+        }
+    };
+
+private:
+    /// Closes the descriptors owned by this object (epoll and both pipe ends); safe to call repeatedly.
+    void closeDescriptors() noexcept
+    {
+        if (epoll_fd != -1)
+        {
+            close(epoll_fd);
+            epoll_fd = -1;
+        }
+
+        for (int & fd : pipe_fd)
+        {
+            if (fd != -1)
+            {
+                close(fd);
+                fd = -1;
+            }
+        }
+    }
+};
+}
+#else
+namespace DB
+{
+/// Stub used when OS_LINUX is not defined: the epoll/timerfd based read context
+/// is unavailable there, so only a no-op cancel() is provided.
+class RemoteQueryExecutorReadContext
+{
+public:
+    /// Nothing to cancel in the stub.
+    void cancel() {}
+};
+
+}
+#endif
--- a/src/DataStreams/ya.make
+++ b/src/DataStreams/ya.make
@ -6,6 +6,7 @@ LIBRARY()
 PEERDIR(
    clickhouse/src/Common
    contrib/libs/poco/MongoDB
+    contrib/restricted/boost/libs
 )

 NO_COMPILER_WARNINGS()
--- a/src/DataStreams/ya.make.in
+++ b/src/DataStreams/ya.make.in
@ -5,6 +5,7 @@ LIBRARY()
 PEERDIR(
    clickhouse/src/Common
    contrib/libs/poco/MongoDB
+    contrib/restricted/boost/libs
 )

 NO_COMPILER_WARNINGS()
--- a/src/DataTypes/DataTypeArray.cpp
+++ b/src/DataTypes/DataTypeArray.cpp
@ -272,7 +272,7 @@ void DataTypeArray::deserializeBinaryBulkWithMultipleStreams(
    /// Check consistency between offsets and elements subcolumns.
    /// But if elements column is empty - it's ok for columns of Nested types that was added by ALTER.
    if (!nested_column.empty() && nested_column.size() != last_offset)
-        throw Exception("Cannot read all array values: read just " + toString(nested_column.size()) + " of " + toString(last_offset),
+        throw ParsingException("Cannot read all array values: read just " + toString(nested_column.size()) + " of " + toString(last_offset),
            ErrorCodes::CANNOT_READ_ALL_DATA);
 }

@ -300,7 +300,7 @@ static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffe


 template <typename Reader>
-static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested)
+static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed)
 {
    ColumnArray & column_array = assert_cast<ColumnArray &>(column);
    ColumnArray::Offsets & offsets = column_array.getOffsets();
@ -308,7 +308,12 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r
    IColumn & nested_column = column_array.getData();

    size_t size = 0;
-    assertChar('[', istr);
+
+    bool has_braces = false;
+    if (checkChar('[', istr))
+        has_braces = true;
+    else if (!allow_unenclosed)
+        throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character");

    try
    {
@ -320,7 +325,9 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r
                if (*istr.position() == ',')
                    ++istr.position();
                else
-                    throw Exception("Cannot read array from text", ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT);
+                    throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT,
+                        "Cannot read array from text, expected comma or end of array, found '{}'",
+                        *istr.position());
            }

            first = false;
@ -335,7 +342,11 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r

            skipWhitespaceIfAny(istr);
        }
-        assertChar(']', istr);
+
+        if (has_braces)
+            assertChar(']', istr);
+        else /// If array is not enclosed in braces, we read until EOF.
+            assertEOF(istr);
    }
    catch (...)
    {
@ -364,7 +375,7 @@ void DataTypeArray::deserializeText(IColumn & column, ReadBuffer & istr, const F
        [&](IColumn & nested_column)
        {
            nested->deserializeAsTextQuoted(nested_column, istr, settings);
-        });
+        }, false);
 }

 void DataTypeArray::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
@ -390,7 +401,11 @@ void DataTypeArray::serializeTextJSON(const IColumn & column, size_t row_num, Wr

 void DataTypeArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
 {
-    deserializeTextImpl(column, istr, [&](IColumn & nested_column) { nested->deserializeAsTextJSON(nested_column, istr, settings); });
+    deserializeTextImpl(column, istr,
+        [&](IColumn & nested_column) /// read_nested callback: delegates to the nested type's JSON deserializer.
+        {
+            nested->deserializeAsTextJSON(nested_column, istr, settings);
+        }, false); /// allow_unenclosed = false: deserializeTextImpl throws if the array does not start with '['.
 }


@ -429,7 +444,23 @@ void DataTypeArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, cons
    String s;
    readCSV(s, istr, settings.csv);
    ReadBufferFromString rb(s);
-    deserializeText(column, rb, settings);
+
+    if (settings.csv.input_format_arrays_as_nested_csv)
+    {
+        deserializeTextImpl(column, rb,
+            [&](IColumn & nested_column)
+            {
+                nested->deserializeAsTextCSV(nested_column, rb, settings);
+            }, true);
+    }
+    else
+    {
+        deserializeTextImpl(column, rb,
+            [&](IColumn & nested_column)
+            {
+                nested->deserializeAsTextQuoted(nested_column, rb, settings);
+            }, true);
+    }
 }


--- a/src/DataTypes/DataTypeNullable.cpp
+++ b/src/DataTypes/DataTypeNullable.cpp
@ -235,7 +235,7 @@ ReturnType DataTypeNullable::deserializeTextEscaped(IColumn & column, ReadBuffer
    /// Little tricky, because we cannot discriminate null from first character.

    if (istr.eof())
-        throw Exception("Unexpected end of stream, while parsing value of Nullable type", ErrorCodes::CANNOT_READ_ALL_DATA);
+        throw ParsingException("Unexpected end of stream, while parsing value of Nullable type", ErrorCodes::CANNOT_READ_ALL_DATA);

    /// This is not null, surely.
    if (*istr.position() != '\\')
@ -250,7 +250,7 @@ ReturnType DataTypeNullable::deserializeTextEscaped(IColumn & column, ReadBuffer
        ++istr.position();

        if (istr.eof())
-            throw Exception("Unexpected end of stream, while parsing value of Nullable type, after backslash", ErrorCodes::CANNOT_READ_ALL_DATA);
+            throw ParsingException("Unexpected end of stream, while parsing value of Nullable type, after backslash", ErrorCodes::CANNOT_READ_ALL_DATA);

        return safeDeserialize<ReturnType>(column, *nested_data_type,
            [&istr]
@ -405,11 +405,11 @@ ReturnType DataTypeNullable::deserializeTextCSV(IColumn & column, ReadBuffer & i
                /// or if someone uses 'U' or 'L' as delimiter in CSV.
                /// In the first case we cannot continue reading anyway. The second case seems to be unlikely.
                if (settings.csv.delimiter == 'U' || settings.csv.delimiter == 'L')
-                    throw DB::Exception("Enabled setting input_format_csv_unquoted_null_literal_as_null may not work correctly "
+                    throw DB::ParsingException("Enabled setting input_format_csv_unquoted_null_literal_as_null may not work correctly "
                                        "with format_csv_delimiter = 'U' or 'L' for large input.", ErrorCodes::CANNOT_READ_ALL_DATA);
                WriteBufferFromOwnString parsed_value;
                nested_data_type->serializeAsTextCSV(nested, nested.size() - 1, parsed_value, settings);
-                throw DB::Exception("Error while parsing \"" + std::string(null_literal, null_prefix_len)
+                throw DB::ParsingException("Error while parsing \"" + std::string(null_literal, null_prefix_len)
                                    + std::string(istr.position(), std::min(size_t{10}, istr.available())) + "\" as Nullable(" + nested_data_type->getName()
                                    + ") at position " + std::to_string(istr.count()) + ": expected \"NULL\" or " + nested_data_type->getName()
                                    + ", got \"" + std::string(null_literal, buf.count()) + "\", which was deserialized as \""
--- a/src/Dictionaries/ExternalQueryBuilder.h
+++ b/src/Dictionaries/ExternalQueryBuilder.h
@ -2,6 +2,7 @@

 #include <string>
 #include <Columns/IColumn.h>
+#include <Dictionaries/DictionaryStructure.h>
 #include <Formats/FormatSettings.h>
 #include <Parsers/IdentifierQuotingStyle.h>

@ -16,11 +17,11 @@ class WriteBuffer;
  */
 struct ExternalQueryBuilder
 {
-    const DictionaryStructure & dict_struct;
-    std::string db;
-    std::string schema;
-    std::string table;
-    const std::string & where;
+    const DictionaryStructure dict_struct;
+    const std::string db;
+    const std::string schema;
+    const std::string table;
+    const std::string where;

    IdentifierQuotingStyle quoting_style;

--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@ -64,6 +64,7 @@ FormatSettings getFormatSettings(const Context & context,
    format_settings.csv.empty_as_default = settings.input_format_defaults_for_omitted_fields;
    format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number;
    format_settings.csv.unquoted_null_literal_as_null = settings.input_format_csv_unquoted_null_literal_as_null;
+    format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
    format_settings.custom.escaping_rule = settings.format_custom_escaping_rule;
    format_settings.custom.field_delimiter = settings.format_custom_field_delimiter;
    format_settings.custom.result_after_delimiter = settings.format_custom_result_after_delimiter;
@ -162,7 +163,7 @@ BlockInputStreamPtr FormatFactory::getInput(
    // (segmentator + two parsers + reader).
    bool parallel_parsing = settings.input_format_parallel_parsing && file_segmentation_engine && settings.max_threads >= 4;

-    if (settings.min_chunk_bytes_for_parallel_parsing * settings.max_threads * 2 > settings.max_memory_usage)
+    if (settings.max_memory_usage && settings.min_chunk_bytes_for_parallel_parsing * settings.max_threads * 2 > settings.max_memory_usage)
        parallel_parsing = false;

    if (parallel_parsing && name == "JSONEachRow")
@ -258,7 +259,6 @@ InputFormatPtr FormatFactory::getInputFormat(

    auto format = input_getter(buf, sample, params, format_settings);

-
    /// It's a kludge. Because I cannot remove context from values format.
    if (auto * values = typeid_cast<ValuesBlockInputFormat *>(format.get()))
        values->setContext(context);
--- a/src/Formats/FormatFactory.h
+++ b/src/Formats/FormatFactory.h
@ -54,7 +54,7 @@ public:
      * Reads at least min_chunk_bytes and some more until the end of the chunk, depends on the format.
      * Used in ParallelParsingBlockInputStream.
      */
-    using FileSegmentationEngine = std::function<bool(
+    using FileSegmentationEngine = std::function<std::pair<bool, size_t>(
        ReadBuffer & buf,
        DB::Memory<> & memory,
        size_t min_chunk_bytes)>;
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@ -71,6 +71,7 @@ struct FormatSettings
        bool empty_as_default = false;
        bool crlf_end_of_line = false;
        bool input_format_enum_as_number = false;
+        bool input_format_arrays_as_nested_csv = false;
    } csv;

    struct Custom
--- a/src/Formats/JSONEachRowUtils.cpp
+++ b/src/Formats/JSONEachRowUtils.cpp
@ -4,13 +4,14 @@
 namespace DB
 {

-bool fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
+std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
 {
    skipWhitespaceIfAny(in);

    char * pos = in.position();
    size_t balance = 0;
    bool quotes = false;
+    size_t number_of_rows = 0;

    while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast<size_t>(pos - in.position()) < min_chunk_size))
    {
@ -57,11 +58,14 @@ bool fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memor
                quotes = true;
                ++pos;
            }
+
+            if (balance == 0)
+                ++number_of_rows;
        }
    }

    saveUpToPosition(in, memory, pos);
-    return loadAtPosition(in, memory, pos);
+    return {loadAtPosition(in, memory, pos), number_of_rows};
 }

 }
--- a/src/Formats/JSONEachRowUtils.h
+++ b/src/Formats/JSONEachRowUtils.h
@ -3,6 +3,6 @@
 namespace DB
 {

-bool fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size);
+std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size);

 }
--- a/Show More
+++ b/Show More