diff --git a/base/common/defines.h b/base/common/defines.h index 6dc61155649..1029becb971 100644 --- a/base/common/defines.h +++ b/base/common/defines.h @@ -61,6 +61,16 @@ # endif #endif +#if defined(ADDRESS_SANITIZER) +# define BOOST_USE_ASAN 1 +# define BOOST_USE_UCONTEXT 1 +#endif + +#if defined(THREAD_SANITIZER) +# define BOOST_USE_TSAN 1 +# define BOOST_USE_UCONTEXT 1 +#endif + /// TODO: Strange enough, there is no way to detect UB sanitizer. /// Explicitly allow undefined behaviour for certain functions. Use it as a function attribute. diff --git a/base/daemon/BaseDaemon.cpp b/base/daemon/BaseDaemon.cpp index 331f9da56dd..f25bcdb91e1 100644 --- a/base/daemon/BaseDaemon.cpp +++ b/base/daemon/BaseDaemon.cpp @@ -4,6 +4,11 @@ #include #include #include +#include +#include +#if defined(__linux__) + #include +#endif #include #include #include @@ -12,7 +17,6 @@ #include #include -#include #include #include #include @@ -22,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -470,7 +473,6 @@ BaseDaemon::~BaseDaemon() void BaseDaemon::terminate() { - getTaskManager().cancelAll(); if (::raise(SIGTERM) != 0) throw Poco::SystemException("cannot terminate process"); } @@ -478,22 +480,11 @@ void BaseDaemon::terminate() void BaseDaemon::kill() { dumpCoverageReportIfPossible(); - pid.reset(); + pid_file.reset(); if (::raise(SIGKILL) != 0) throw Poco::SystemException("cannot kill process"); } -void BaseDaemon::sleep(double seconds) -{ - wakeup_event.reset(); - wakeup_event.tryWait(seconds * 1000); -} - -void BaseDaemon::wakeup() -{ - wakeup_event.set(); -} - std::string BaseDaemon::getDefaultCorePath() const { return "/opt/cores/"; @@ -564,7 +555,6 @@ void BaseDaemon::initialize(Application & self) { closeFDs(); - task_manager = std::make_unique(); ServerApplication::initialize(self); /// now highest priority (lowest value) is PRIO_APPLICATION = -100, we want higher! @@ -648,10 +638,6 @@ void BaseDaemon::initialize(Application & self) throw Poco::OpenFileException("Cannot attach stdout to " + stdout_path); } - /// Create pid file. - if (config().has("pid")) - pid.emplace(config().getString("pid"), DB::StatusFile::write_pid); - /// Change path for logging. if (!log_path.empty()) { @@ -667,9 +653,17 @@ void BaseDaemon::initialize(Application & self) throw Poco::Exception("Cannot change directory to /tmp"); } - // sensitive data masking rules are not used here + /// sensitive data masking rules are not used here buildLoggers(config(), logger(), self.commandName()); + /// After initialized loggers but before initialized signal handling. + if (should_setup_watchdog) + setupWatchdog(); + + /// Create pid file. + if (config().has("pid")) + pid_file.emplace(config().getString("pid"), DB::StatusFile::write_pid); + if (is_daemon) { /** Change working directory to the directory to write core dumps. 
@@ -704,54 +698,71 @@ void BaseDaemon::initialize(Application & self) } +static void addSignalHandler(const std::vector & signals, signal_function handler, std::vector * out_handled_signals) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO; + +#if defined(OS_DARWIN) + sigemptyset(&sa.sa_mask); + for (auto signal : signals) + sigaddset(&sa.sa_mask, signal); +#else + if (sigemptyset(&sa.sa_mask)) + throw Poco::Exception("Cannot set signal handler."); + + for (auto signal : signals) + if (sigaddset(&sa.sa_mask, signal)) + throw Poco::Exception("Cannot set signal handler."); +#endif + + for (auto signal : signals) + if (sigaction(signal, &sa, nullptr)) + throw Poco::Exception("Cannot set signal handler."); + + if (out_handled_signals) + std::copy(signals.begin(), signals.end(), std::back_inserter(*out_handled_signals)); +}; + + +static void blockSignals(const std::vector & signals) +{ + sigset_t sig_set; + +#if defined(OS_DARWIN) + sigemptyset(&sig_set); + for (auto signal : signals) + sigaddset(&sig_set, signal); +#else + if (sigemptyset(&sig_set)) + throw Poco::Exception("Cannot block signal."); + + for (auto signal : signals) + if (sigaddset(&sig_set, signal)) + throw Poco::Exception("Cannot block signal."); +#endif + + if (pthread_sigmask(SIG_BLOCK, &sig_set, nullptr)) + throw Poco::Exception("Cannot block signal."); +}; + + void BaseDaemon::initializeTerminationAndSignalProcessing() { SentryWriter::initialize(config()); std::set_terminate(terminate_handler); /// We want to avoid SIGPIPE when working with sockets and pipes, and just handle return value/errno instead. - { - sigset_t sig_set; - if (sigemptyset(&sig_set) || sigaddset(&sig_set, SIGPIPE) || pthread_sigmask(SIG_BLOCK, &sig_set, nullptr)) - throw Poco::Exception("Cannot block signal."); - } + blockSignals({SIGPIPE}); /// Setup signal handlers. - auto add_signal_handler = - [this](const std::vector & signals, signal_function handler) - { - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO; - - { -#if defined(OS_DARWIN) - sigemptyset(&sa.sa_mask); - for (auto signal : signals) - sigaddset(&sa.sa_mask, signal); -#else - if (sigemptyset(&sa.sa_mask)) - throw Poco::Exception("Cannot set signal handler."); - - for (auto signal : signals) - if (sigaddset(&sa.sa_mask, signal)) - throw Poco::Exception("Cannot set signal handler."); -#endif - - for (auto signal : signals) - if (sigaction(signal, &sa, nullptr)) - throw Poco::Exception("Cannot set signal handler."); - - std::copy(signals.begin(), signals.end(), std::back_inserter(handled_signals)); - } - }; - /// SIGTSTP is added for debugging purposes. To output a stack trace of any running thread at anytime. 
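For illustration, the two helpers introduced above (addSignalHandler and blockSignals) boil down to standard sigaction/pthread_sigmask calls. A rough, self-contained sketch of the same pattern follows; it is not code from this patch, and the helper names and error handling here are illustrative only:

#include <pthread.h>
#include <signal.h>
#include <stdexcept>
#include <vector>

/// Install one SA_SIGINFO handler for a set of signals, masking the whole set
/// while the handler runs. This mirrors what addSignalHandler() does above.
static void installHandler(const std::vector<int> & signals, void (*handler)(int, siginfo_t *, void *))
{
    struct sigaction sa{};
    sa.sa_sigaction = handler;
    sa.sa_flags = SA_SIGINFO;

    if (sigemptyset(&sa.sa_mask))
        throw std::runtime_error("sigemptyset failed");
    for (int sig : signals)
        if (sigaddset(&sa.sa_mask, sig))
            throw std::runtime_error("sigaddset failed");
    for (int sig : signals)
        if (sigaction(sig, &sa, nullptr))
            throw std::runtime_error("sigaction failed");
}

/// Block delivery of the given signals in the calling thread (and in threads
/// created afterwards), as blockSignals({SIGPIPE}) does above.
static void blockInThisThread(const std::vector<int> & signals)
{
    sigset_t set;
    if (sigemptyset(&set))
        throw std::runtime_error("sigemptyset failed");
    for (int sig : signals)
        if (sigaddset(&set, sig))
            throw std::runtime_error("sigaddset failed");
    if (pthread_sigmask(SIG_BLOCK, &set, nullptr))
        throw std::runtime_error("pthread_sigmask failed");
}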
- add_signal_handler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP}, signalHandler); - add_signal_handler({SIGHUP, SIGUSR1}, closeLogsSignalHandler); - add_signal_handler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler); + addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP}, signalHandler, &handled_signals); + addSignalHandler({SIGHUP, SIGUSR1}, closeLogsSignalHandler, &handled_signals); + addSignalHandler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler, &handled_signals); #if defined(SANITIZER) __sanitizer_set_death_callback(sanitizerDeathCallback); @@ -786,23 +797,6 @@ void BaseDaemon::logRevision() const + ", PID " + std::to_string(getpid())); } -/// Makes server shutdown if at least one Poco::Task have failed. -void BaseDaemon::exitOnTaskError() -{ - Poco::Observer obs(*this, &BaseDaemon::handleNotification); - getTaskManager().addObserver(obs); -} - -/// Used for exitOnTaskError() -void BaseDaemon::handleNotification(Poco::TaskFailedNotification *_tfn) -{ - task_failed = true; - Poco::AutoPtr fn(_tfn); - Poco::Logger * lg = &(logger()); - LOG_ERROR(lg, "Task '{}' failed. Daemon is shutting down. Reason - {}", fn->task()->name(), fn->reason().displayText()); - ServerApplication::terminate(); -} - void BaseDaemon::defineOptions(Poco::Util::OptionSet & new_options) { new_options.addOption( @@ -863,13 +857,144 @@ void BaseDaemon::onInterruptSignals(int signal_id) if (sigint_signals_counter >= 2) { LOG_INFO(&logger(), "Received second signal Interrupt. Immediately terminate."); - kill(); + call_default_signal_handler(signal_id); + /// If the above did not help. + _exit(128 + signal_id); } } void BaseDaemon::waitForTerminationRequest() { + /// NOTE: as we already process signals via pipe, we don't have to block them with sigprocmask in threads std::unique_lock lock(signal_handler_mutex); signal_event.wait(lock, [this](){ return terminate_signals_counter > 0; }); } + + +void BaseDaemon::shouldSetupWatchdog(char * argv0_) +{ + should_setup_watchdog = true; + argv0 = argv0_; +} + + +void BaseDaemon::setupWatchdog() +{ + /// Initialize in advance to avoid double initialization in forked processes. + DateLUT::instance(); + + std::string original_process_name; + if (argv0) + original_process_name = argv0; + + while (true) + { + static pid_t pid = -1; + pid = fork(); + + if (-1 == pid) + throw Poco::Exception("Cannot fork"); + + if (0 == pid) + { + logger().information("Forked a child process to watch"); +#if defined(__linux__) + if (0 != prctl(PR_SET_PDEATHSIG, SIGKILL)) + logger().warning("Cannot do prctl to ask termination with parent."); +#endif + return; + } + + /// Change short thread name and process name. + setThreadName("clckhouse-watch"); /// 15 characters + + if (argv0) + { + const char * new_process_name = "clickhouse-watchdog"; + memset(argv0, 0, original_process_name.size()); + memcpy(argv0, new_process_name, std::min(strlen(new_process_name), original_process_name.size())); + } + + logger().information(fmt::format("Will watch for the process with pid {}", pid)); + + /// Forward signals to the child process. + addSignalHandler( + {SIGHUP, SIGUSR1, SIGINT, SIGQUIT, SIGTERM}, + [](int sig, siginfo_t *, void *) + { + /// Forward all signals except INT as it can be send by terminal to the process group when user press Ctrl+C, + /// and we process double delivery of this signal as immediate termination. 
+ if (sig == SIGINT) + return; + + const char * error_message = "Cannot forward signal to the child process.\n"; + if (0 != ::kill(pid, sig)) + { + auto res = write(STDERR_FILENO, error_message, strlen(error_message)); + (void)res; + } + }, + nullptr); + + int status = 0; + do + { + if (-1 != waitpid(pid, &status, WUNTRACED | WCONTINUED) || errno == ECHILD) + { + if (WIFSTOPPED(status)) + logger().warning(fmt::format("Child process was stopped by signal {}.", WSTOPSIG(status))); + else if (WIFCONTINUED(status)) + logger().warning(fmt::format("Child process was continued.")); + else + break; + } + else if (errno != EINTR) + throw Poco::Exception("Cannot waitpid, errno: " + std::string(strerror(errno))); + } while (true); + + if (errno == ECHILD) + { + logger().information("Child process no longer exists."); + _exit(status); + } + + if (WIFEXITED(status)) + { + logger().information(fmt::format("Child process exited normally with code {}.", WEXITSTATUS(status))); + _exit(status); + } + + if (WIFSIGNALED(status)) + { + int sig = WTERMSIG(status); + + if (sig == SIGKILL) + { + logger().fatal(fmt::format("Child process was terminated by signal {} (KILL)." + " If it is not done by 'forcestop' command or manually," + " the possible cause is OOM Killer (see 'dmesg' and look at the '/var/log/kern.log' for the details).", sig)); + } + else + { + logger().fatal(fmt::format("Child process was terminated by signal {}.", sig)); + + if (sig == SIGINT || sig == SIGTERM || sig == SIGQUIT) + _exit(status); + } + } + else + { + logger().fatal("Child process was not exited normally by unknown reason."); + } + + /// Automatic restart is not enabled but you can play with it. +#if 1 + _exit(status); +#else + logger().information("Will restart."); + if (argv0) + memcpy(argv0, original_process_name.c_str(), original_process_name.size()); +#endif + } +} diff --git a/base/daemon/BaseDaemon.h b/base/daemon/BaseDaemon.h index f4d3f3dfe98..090d4997606 100644 --- a/base/daemon/BaseDaemon.h +++ b/base/daemon/BaseDaemon.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -26,9 +25,6 @@ #include -namespace Poco { class TaskManager; } - - /// \brief Base class for applications that can run as daemons. /// /// \code @@ -52,31 +48,26 @@ public: BaseDaemon(); ~BaseDaemon() override; - /// Загружает конфигурацию и "строит" логгеры на запись в файлы + /// Load configuration, prepare loggers, etc. void initialize(Poco::Util::Application &) override; - /// Читает конфигурацию void reloadConfiguration(); - /// Определяет параметр командной строки + /// Process command line parameters void defineOptions(Poco::Util::OptionSet & new_options) override; - /// Заставляет демон завершаться, если хотя бы одна задача завершилась неудачно - void exitOnTaskError(); + /// Graceful shutdown + static void terminate(); - /// Завершение демона ("мягкое") - void terminate(); - - /// Завершение демона ("жёсткое") + /// Forceful shutdown void kill(); - /// Получен ли сигнал на завершение? + /// Cancellation request has been received. 
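The watchdog in setupWatchdog() above follows the classic fork-and-waitpid pattern: the child returns into normal server startup, while the parent waits for it and reports how it ended. A condensed, hypothetical sketch of that control flow (signal forwarding, process renaming and logging are left out; this is not the patch's code):

#include <sys/wait.h>
#include <unistd.h>
#include <cerrno>
#include <cstdio>

/// Minimal watchdog: the parent only waits for the child and reports its exit
/// status; the child continues as the real server process.
int runWithWatchdog(int (*server_main)())
{
    pid_t pid = fork();
    if (pid < 0)
        return 1;

    if (pid == 0)
        return server_main();   /// Child: run the actual server.

    int status = 0;
    while (waitpid(pid, &status, 0) < 0 && errno == EINTR)
        ;                       /// Retry if interrupted by a signal.

    if (WIFEXITED(status))
        fprintf(stderr, "child exited with code %d\n", WEXITSTATUS(status));
    else if (WIFSIGNALED(status))
        fprintf(stderr, "child was terminated by signal %d\n", WTERMSIG(status));

    /// Like the patch (its "#if 1" branch), do not restart automatically.
    _exit(status);
}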
bool isCancelled() const { return is_cancelled; } - /// Получение ссылки на экземпляр демона static BaseDaemon & instance() { return dynamic_cast(Poco::Util::Application::instance()); @@ -85,12 +76,6 @@ public: /// return none if daemon doesn't exist, reference to the daemon otherwise static std::optional> tryGetInstance() { return tryGetInstance(); } - /// Спит заданное количество секунд или до события wakeup - void sleep(double seconds); - - /// Разбудить - void wakeup(); - /// В Graphite компоненты пути(папки) разделяются точкой. /// У нас принят путь формата root_path.hostname_yandex_ru.key /// root_path по умолчанию one_min @@ -131,24 +116,23 @@ public: /// also doesn't close global internal pipes for signal handling static void closeFDs(); + /// If this method is called after initialization and before run, + /// will fork child process and setup watchdog that will print diagnostic info, if the child terminates. + /// argv0 is needed to change process name (consequently, it is needed for scripts involving "pgrep", "pidof" to work correctly). + void shouldSetupWatchdog(char * argv0_); + protected: - /// Возвращает TaskManager приложения - /// все методы task_manager следует вызывать из одного потока - /// иначе возможен deadlock, т.к. joinAll выполняется под локом, а любой метод тоже берет лок - Poco::TaskManager & getTaskManager() { return *task_manager; } - virtual void logRevision() const; - /// Используется при exitOnTaskError() - void handleNotification(Poco::TaskFailedNotification *); - /// thread safe virtual void handleSignal(int signal_id); /// initialize termination process and signal handlers virtual void initializeTerminationAndSignalProcessing(); - /// реализация обработки сигналов завершения через pipe не требует блокировки сигнала с помощью sigprocmask во всех потоках + /// fork the main process and watch if it was killed + void setupWatchdog(); + void waitForTerminationRequest() #if defined(POCO_CLICKHOUSE_PATCH) || POCO_VERSION >= 0x02000000 // in old upstream poco not vitrual override @@ -162,21 +146,13 @@ protected: virtual std::string getDefaultCorePath() const; - std::unique_ptr task_manager; - - std::optional pid; + std::optional pid_file; std::atomic_bool is_cancelled{false}; - /// Флаг устанавливается по сообщению из Task (при аварийном завершении). - bool task_failed = false; - bool log_to_console = false; - /// Событие, чтобы проснуться во время ожидания - Poco::Event wakeup_event; - - /// Поток, в котором принимается сигнал HUP/USR1 для закрытия логов. + /// A thread that acts on HUP and USR1 signal (close logs). 
Poco::Thread signal_listener_thread; std::unique_ptr signal_listener; @@ -194,6 +170,9 @@ protected: String build_id_info; std::vector handled_signals; + + bool should_setup_watchdog = false; + char * argv0 = nullptr; }; diff --git a/base/glibc-compatibility/musl/timerfd.c b/base/glibc-compatibility/musl/timerfd.c new file mode 100644 index 00000000000..0f9adb54389 --- /dev/null +++ b/base/glibc-compatibility/musl/timerfd.c @@ -0,0 +1,17 @@ +#include +#include "syscall.h" + +int timerfd_create(int clockid, int flags) +{ + return syscall(SYS_timerfd_create, clockid, flags); +} + +int timerfd_settime(int fd, int flags, const struct itimerspec *new, struct itimerspec *old) +{ + return syscall(SYS_timerfd_settime, fd, flags, new, old); +} + +int timerfd_gettime(int fd, struct itimerspec *cur) +{ + return syscall(SYS_timerfd_gettime, fd, cur); +} diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index fd860c9f9b0..f7c1ce22e90 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -11,10 +11,11 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) iostreams program_options regex + context ) if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND - Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY) + Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY) set(EXTERNAL_BOOST_FOUND 1) @@ -27,18 +28,21 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (_boost_program_options INTERFACE) add_library (_boost_regex INTERFACE) add_library (_boost_system INTERFACE) + add_library (_boost_context INTERFACE) target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY}) target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY}) target_link_libraries (_boost_program_options INTERFACE ${Boost_PROGRAM_OPTIONS_LIBRARY}) target_link_libraries (_boost_regex INTERFACE ${Boost_REGEX_LIBRARY}) target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY}) + target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY}) add_library (boost::filesystem ALIAS _boost_filesystem) add_library (boost::iostreams ALIAS _boost_iostreams) add_library (boost::program_options ALIAS _boost_program_options) add_library (boost::regex ALIAS _boost_regex) add_library (boost::system ALIAS _boost_system) + add_library (boost::context ALIAS _boost_context) else() set(EXTERNAL_BOOST_FOUND 0) message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost") @@ -142,4 +146,57 @@ if (NOT EXTERNAL_BOOST_FOUND) add_library (_boost_system ${SRCS_SYSTEM}) add_library (boost::system ALIAS _boost_system) target_include_directories (_boost_system PRIVATE ${LIBRARY_DIR}) + + # context + enable_language(ASM) + SET(ASM_OPTIONS "-x assembler-with-cpp") + + if (SANITIZE AND (SANITIZE STREQUAL "address" OR SANITIZE STREQUAL "thread")) + add_compile_definitions(BOOST_USE_UCONTEXT) + + if (SANITIZE STREQUAL "address") + add_compile_definitions(BOOST_USE_ASAN) + elseif (SANITIZE STREQUAL "thread") + add_compile_definitions(BOOST_USE_TSAN) + endif() + + set (SRCS_CONTEXT + ${LIBRARY_DIR}/libs/context/src/fiber.cpp + ${LIBRARY_DIR}/libs/context/src/continuation.cpp + ${LIBRARY_DIR}/libs/context/src/dummy.cpp + ${LIBRARY_DIR}/libs/context/src/execution_context.cpp + ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp + ) + elseif (ARCH_ARM) + set (SRCS_CONTEXT + ${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S + 
${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/dummy.cpp + ${LIBRARY_DIR}/libs/context/src/execution_context.cpp + ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp + ) + elseif(OS_DARWIN) + set (SRCS_CONTEXT + ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/dummy.cpp + ${LIBRARY_DIR}/libs/context/src/execution_context.cpp + ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp + ) + else() + set (SRCS_CONTEXT + ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/dummy.cpp + ${LIBRARY_DIR}/libs/context/src/execution_context.cpp + ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp + ) + endif() + + add_library (_boost_context ${SRCS_CONTEXT}) + add_library (boost::context ALIAS _boost_context) + target_include_directories (_boost_context PRIVATE ${LIBRARY_DIR}) endif () diff --git a/contrib/grpc-cmake/CMakeLists.txt b/contrib/grpc-cmake/CMakeLists.txt index efb0f1c4f43..97ca3fab4db 100644 --- a/contrib/grpc-cmake/CMakeLists.txt +++ b/contrib/grpc-cmake/CMakeLists.txt @@ -54,6 +54,26 @@ else () set(CARES_SHARED ON CACHE BOOL "" FORCE) endif () +# Disable looking for libnsl on platforms that have gethostbyname in glibc +# +# c-ares searches for gethostbyname in the libnsl library; however, the +# version shipped with gRPC does this incorrectly [1], since it uses +# CHECK_LIBRARY_EXISTS(), which will return TRUE even if the function exists in +# another dependent library. The upstream already contains the correct macro [2], +# but it is not included in gRPC (not even upstream gRPC, let alone the one +# shipped with clickhouse). +# +# [1]: https://github.com/c-ares/c-ares/blob/e982924acee7f7313b4baa4ee5ec000c5e373c30/CMakeLists.txt#L125 +# [2]: https://github.com/c-ares/c-ares/blob/44fbc813685a1fa8aa3f27fcd7544faf612d376a/CMakeLists.txt#L146 +# +# As a result, if you happen to have libnsl [3] installed, clickhouse will +# refuse to start without it, even though it is a completely different library. +# +# [3]: https://packages.debian.org/bullseye/libnsl2 +if (NOT CMAKE_SYSTEM_NAME STREQUAL "SunOS") + set(HAVE_LIBNSL OFF CACHE BOOL "" FORCE) +endif() + # We don't want to build C# extensions.
set(gRPC_BUILD_CSHARP_EXT OFF) diff --git a/debian/rules b/debian/rules index caf09324d3e..30015ba0afd 100755 --- a/debian/rules +++ b/debian/rules @@ -62,7 +62,7 @@ ifndef DISABLE_NINJA NINJA=$(shell which ninja) ifneq ($(NINJA),) CMAKE_FLAGS += -GNinja - export MAKE=$(NINJA) + export MAKE=$(NINJA) $(NINJA_FLAGS) endif endif diff --git a/docker/packager/unbundled/Dockerfile b/docker/packager/unbundled/Dockerfile index 2f501f76e68..c6ebe95d44a 100644 --- a/docker/packager/unbundled/Dockerfile +++ b/docker/packager/unbundled/Dockerfile @@ -21,6 +21,7 @@ RUN apt-get update \ libboost-thread-dev \ libboost-iostreams-dev \ libboost-regex-dev \ + libboost-context-dev \ zlib1g-dev \ liblz4-dev \ libdouble-conversion-dev \ diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 8793f3d22b4..e6e987e1d94 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -12,7 +12,32 @@ dpkg -i package_folder/clickhouse-test_*.deb # install test configs /usr/share/clickhouse-test/config/install.sh -service clickhouse-server start && sleep 5 +# For flaky check we also enable thread fuzzer +if [ "$NUM_TRIES" -gt "1" ]; then + export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000 + export THREAD_FUZZER_SLEEP_PROBABILITY=0.1 + export THREAD_FUZZER_SLEEP_TIME_US=100000 + + export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1 + export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1 + export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1 + export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1 + + export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001 + export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001 + export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001 + export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001 + export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000 + export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000 + export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000 + export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000 + + # simpliest way to forward env variables to server + sudo -E -u clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml --daemon + sleep 5 +else + service clickhouse-server start && sleep 5 +fi if grep -q -- "--use-skip-list" /usr/bin/clickhouse-test; then SKIP_LIST_OPT="--use-skip-list" diff --git a/docs/tools/purge_cache_for_changed_files.py b/docs/tools/purge_cache_for_changed_files.py deleted file mode 100644 index 6cfc9d18a57..00000000000 --- a/docs/tools/purge_cache_for_changed_files.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python3 - -import subprocess -import requests -import os -import time - -FNAME_START = "+++" - -CLOUDFLARE_URL = "https://api.cloudflare.com/client/v4/zones/4fc6fb1d46e87851605aa7fa69ca6fe0/purge_cache" - -# we have changes in revision and commit sha on all pages -# so such changes have to be ignored -MIN_CHANGED_WORDS = 4 - - -def collect_changed_files(): - proc = subprocess.Popen("git diff HEAD~1 --word-diff=porcelain | grep -e '^+[^+]\|^\-[^\-]\|^\+\+\+'", stdout=subprocess.PIPE, shell=True) - changed_files = [] - current_file_name = "" - changed_words = [] - while True: - line = proc.stdout.readline().decode("utf-8").strip() - if not line: - break - if FNAME_START in line: - if changed_words: - if len(changed_words) > MIN_CHANGED_WORDS: - changed_files.append(current_file_name) - changed_words = [] - 
current_file_name = line[6:] - else: - changed_words.append(line) - return changed_files - - -def filter_and_transform_changed_files(changed_files, base_domain): - result = [] - for f in changed_files: - if f.endswith(".html"): - result.append(base_domain + f.replace("index.html", "")) - return result - - -def convert_to_dicts(changed_files, batch_size): - result = [] - current_batch = {"files": []} - for f in changed_files: - if len(current_batch["files"]) >= batch_size: - result.append(current_batch) - current_batch = {"files": []} - current_batch["files"].append(f) - - if current_batch["files"]: - result.append(current_batch) - return result - - -def post_data(prepared_batches, token): - headers = {"Authorization": "Bearer {}".format(token)} - for batch in prepared_batches: - print(("Pugring cache for", ", ".join(batch["files"]))) - response = requests.post(CLOUDFLARE_URL, json=batch, headers=headers) - response.raise_for_status() - time.sleep(3) - - -if __name__ == "__main__": - token = os.getenv("CLOUDFLARE_TOKEN") - if not token: - raise Exception("Env variable CLOUDFLARE_TOKEN is empty") - base_domain = os.getenv("BASE_DOMAIN", "https://content.clickhouse.tech/") - changed_files = collect_changed_files() - print(("Found", len(changed_files), "changed files")) - filtered_files = filter_and_transform_changed_files(changed_files, base_domain) - print(("Files rest after filtering", len(filtered_files))) - prepared_batches = convert_to_dicts(filtered_files, 25) - post_data(prepared_batches, token) diff --git a/docs/tools/release.sh b/docs/tools/release.sh index a65827fc073..9e529b25d0f 100755 --- a/docs/tools/release.sh +++ b/docs/tools/release.sh @@ -32,12 +32,14 @@ then git add ".nojekyll" # Push to GitHub rewriting the existing contents. - git commit -a -m "Add new release at $(date)" + git commit --quiet -m "Add new release at $(date)" git push --force origin master if [[ ! -z "${CLOUDFLARE_TOKEN}" ]] then sleep 1m - python3 "${BASE_DIR}/purge_cache_for_changed_files.py" + # https://api.cloudflare.com/#zone-purge-files-by-cache-tags,-host-or-prefix + POST_DATA='{"hosts":"clickhouse.tech"}' + curl -X POST "https://api.cloudflare.com/client/v4/zones/4fc6fb1d46e87851605aa7fa69ca6fe0/purge_cache" -H "Authorization: Bearer ${CLOUDFLARE_TOKEN}" -H "Content-Type:application/json" --data "${POST_DATA}" fi fi diff --git a/docs/zh/sql-reference/data-types/datetime64.md b/docs/zh/sql-reference/data-types/datetime64.md index 2442972965a..46e8e9a5fa4 100644 --- a/docs/zh/sql-reference/data-types/datetime64.md +++ b/docs/zh/sql-reference/data-types/datetime64.md @@ -7,9 +7,9 @@ toc_title: DateTime64 # Datetime64 {#data_type-datetime64} -允许存储时间instant间,可以表示为日历日期和一天中的时间,具有定义的亚秒精度 +此类型允许以日期(date)加时间(time)的形式来存储一个时刻的时间值,具有定义的亚秒精度 -刻度尺寸(精度):10-精度 秒 +时间刻度大小(精度):10-精度 秒 语法: @@ -17,11 +17,11 @@ toc_title: DateTime64 DateTime64(precision, [timezone]) ``` -在内部,存储数据作为一些 ‘ticks’ 自纪元开始(1970-01-01 00:00:00UTC)作为Int64. 刻度分辨率由precision参数确定。 此外,该 `DateTime64` 类型可以存储时区是相同的整个列,影响如何的值 `DateTime64` 类型值以文本格式显示,以及如何解析指定为字符串的值 (‘2020-01-01 05:00:01.000’). 时区不存储在表的行中(或resultset中),而是存储在列元数据中。 查看详细信息 [日期时间](datetime.md). +在内部,此类型以Int64类型将数据存储为自Linux纪元开始(1970-01-01 00:00:00UTC)的时间刻度数(ticks)。时间刻度的分辨率由precision参数确定。此外,`DateTime64` 类型可以像存储其他数据列一样存储时区信息,时区会影响 `DateTime64` 类型的值如何以文本格式显示,以及如何解析以字符串形式指定的时间数据 (‘2020-01-01 05:00:01.000’)。时区不存储在表的行中(也不在resultset中),而是存储在列的元数据中。详细信息请参考 [DateTime](datetime.md) 数据类型. 
-## 例 {#examples} +## 示例 {#examples} -**1.** 创建一个表 `DateTime64`-输入列并将数据插入其中: +**1.** 创建一个具有 `DateTime64` 类型列的表,并向其中插入数据: ``` sql CREATE TABLE dt @@ -47,10 +47,10 @@ SELECT * FROM dt └─────────────────────────┴──────────┘ ``` -- 将日期时间作为整数插入时,将其视为适当缩放的Unix时间戳(UTC)。 `1546300800000` (精度为3)表示 `'2019-01-01 00:00:00'` UTC. 然而,作为 `timestamp` 列有 `Europe/Moscow` (UTC+3)指定的时区,当输出为字符串时,该值将显示为 `'2019-01-01 03:00:00'` -- 当插入字符串值作为日期时间时,它被视为处于列时区。 `'2019-01-01 00:00:00'` 将被视为 `Europe/Moscow` 时区并存储为 `1546290000000`. +- 将日期时间作为integer类型插入时,它会被视为适当缩放的Unix时间戳(UTC)。`1546300800000` (精度为3)表示 `'2019-01-01 00:00:00'` UTC. 不过,因为 `timestamp` 列指定了 `Europe/Moscow` (UTC+3)的时区,当作为字符串输出时,它将显示为 `'2019-01-01 03:00:00'` +- 当把字符串作为日期时间插入时,它会被赋予时区信息。 `'2019-01-01 00:00:00'` 将被认为处于 `Europe/Moscow` 时区并被存储为 `1546290000000`. -**2.** 过滤 `DateTime64` 值 +**2.** 过滤 `DateTime64` 类型的值 ``` sql SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow') @@ -62,9 +62,9 @@ SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europ └─────────────────────────┴──────────┘ ``` -不像 `DateTime`, `DateTime64` 值不转换为 `String` 自动 +与 `DateTime` 不同, `DateTime64` 类型的值不会自动从 `String` 类型的值转换过来 -**3.** 获取一个时区 `DateTime64`-类型值: +**3.** 获取 `DateTime64` 类型值的时区信息: ``` sql SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x @@ -97,8 +97,9 @@ FROM dt - [类型转换函数](../../sql-reference/functions/type-conversion-functions.md) - [用于处理日期和时间的函数](../../sql-reference/functions/date-time-functions.md) - [用于处理数组的函数](../../sql-reference/functions/array-functions.md) -- [该 `date_time_input_format` 设置](../../operations/settings/settings.md#settings-date_time_input_format) -- [该 `timezone` 服务器配置参数](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) -- [使用日期和时间的操作员](../../sql-reference/operators/index.md#operators-datetime) +- [`date_time_input_format` 配置](../../operations/settings/settings.md#settings-date_time_input_format) +- [`date_time_output_format` 配置](../../operations/settings/settings.md#settings-date_time_output_format) +- [`timezone` 服务器配置参数](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) +- [用于处理日期和时间的算子](../../sql-reference/operators/index.md#operators-datetime) - [`Date` 数据类型](date.md) - [`DateTime` 数据类型](datetime.md) diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 9d0227a51e3..a8c4f070bea 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -949,6 +949,11 @@ private: TestHint test_hint(test_mode, all_queries_text); if (test_hint.clientError() || test_hint.serverError()) processTextAsSingleQuery("SET send_logs_level = 'none'"); + + // Echo all queries if asked; makes for a more readable reference + // file. + if (test_hint.echoQueries()) + echo_queries = true; } /// Several queries separated by ';'. 
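The client change above turns on query echoing when the test file carries an "-- { echo }" hint. A simplified, hypothetical stand-in for that hint parsing (the real logic lives in TestHint::parse further down in this diff; the sketch only shows the idea):

#include <sstream>
#include <string>

/// Extract the body of a trailing "-- { ... }" hint from a query text and
/// record whether it asks for echoing; a simplified stand-in for TestHint.
struct SimpleTestHint
{
    int server_error = 0;
    int client_error = 0;
    bool echo = false;

    explicit SimpleTestHint(const std::string & query)
    {
        auto pos = query.rfind("-- {");
        auto end = query.rfind('}');
        if (pos == std::string::npos || end == std::string::npos || end < pos)
            return;

        std::istringstream ss(query.substr(pos + 4, end - pos - 4));
        std::string item;
        while (ss >> item)
        {
            if (item == "serverError")
                ss >> server_error;
            else if (item == "clientError")
                ss >> client_error;
            else if (item == "echo")
                echo = true;
        }
    }
};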
diff --git a/programs/client/QueryFuzzer.cpp b/programs/client/QueryFuzzer.cpp index a8e32d87db5..53ede4a3d92 100644 --- a/programs/client/QueryFuzzer.cpp +++ b/programs/client/QueryFuzzer.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int TOO_DEEP_RECURSION; +} + Field QueryFuzzer::getRandomField(int type) { switch (type) @@ -205,14 +211,88 @@ void QueryFuzzer::replaceWithTableLike(ASTPtr & ast) ast = new_ast; } -void QueryFuzzer::fuzzColumnLikeExpressionList(ASTPtr ast) +void QueryFuzzer::fuzzOrderByElement(ASTOrderByElement * elem) +{ + switch (fuzz_rand() % 10) + { + case 0: + elem->direction = -1; + break; + case 1: + elem->direction = 1; + break; + case 2: + elem->nulls_direction = -1; + elem->nulls_direction_was_explicitly_specified = true; + break; + case 3: + elem->nulls_direction = 1; + elem->nulls_direction_was_explicitly_specified = true; + break; + case 4: + elem->nulls_direction = elem->direction; + elem->nulls_direction_was_explicitly_specified = false; + break; + default: + // do nothing + break; + } +} + +void QueryFuzzer::fuzzOrderByList(IAST * ast) { if (!ast) { return; } - auto * impl = assert_cast(ast.get()); + auto * list = assert_cast(ast); + + // Remove element + if (fuzz_rand() % 50 == 0 && list->children.size() > 1) + { + // Don't remove last element -- this leads to questionable + // constructs such as empty select. + list->children.erase(list->children.begin() + + fuzz_rand() % list->children.size()); + } + + // Add element + if (fuzz_rand() % 50 == 0) + { + auto pos = list->children.empty() + ? list->children.begin() + : list->children.begin() + fuzz_rand() % list->children.size(); + auto col = getRandomColumnLike(); + if (col) + { + auto elem = std::make_shared(); + elem->children.push_back(col); + elem->direction = 1; + elem->nulls_direction = 1; + elem->nulls_direction_was_explicitly_specified = false; + elem->with_fill = false; + + list->children.insert(pos, elem); + } + else + { + fprintf(stderr, "no random col!\n"); + } + } + + // We don't have to recurse here to fuzz the children, this is handled by + // the generic recursion into IAST.children. +} + +void QueryFuzzer::fuzzColumnLikeExpressionList(IAST * ast) +{ + if (!ast) + { + return; + } + + auto * impl = assert_cast(ast); // Remove element if (fuzz_rand() % 50 == 0 && impl->children.size() > 1) @@ -252,11 +332,44 @@ void QueryFuzzer::fuzz(ASTs & asts) } } +struct ScopedIncrement +{ + size_t & counter; + + explicit ScopedIncrement(size_t & counter_) : counter(counter_) { ++counter; } + ~ScopedIncrement() { --counter; } +}; + void QueryFuzzer::fuzz(ASTPtr & ast) { if (!ast) return; + // Check for exceeding max depth. + ScopedIncrement depth_increment(current_ast_depth); + if (current_ast_depth > 500) + { + // The AST is too deep (see the comment for current_ast_depth). Throw + // an exception to fail fast and not use this query as an etalon, or we'll + // end up in a very slow and useless loop. It also makes sense to set it + // lower than the default max parse depth on the server (1000), so that + // we don't get the useless error about parse depth from the server either. + throw Exception(ErrorCodes::TOO_DEEP_RECURSION, + "AST depth exceeded while fuzzing ({})", current_ast_depth); + } + + // Check for loops. + auto [_, inserted] = debug_visited_nodes.insert(ast.get()); + if (!inserted) + { + fmt::print(stderr, "The AST node '{}' was already visited before." 
+ " Depth {}, {} visited nodes, current top AST:\n{}\n", + static_cast(ast.get()), current_ast_depth, + debug_visited_nodes.size(), (*debug_top_ast)->dumpTree()); + assert(false); + } + + // The fuzzing. if (auto * with_union = typeid_cast(ast.get())) { fuzz(with_union->list_of_selects); @@ -281,17 +394,28 @@ void QueryFuzzer::fuzz(ASTPtr & ast) { fuzz(expr_list->children); } + else if (auto * order_by_element = typeid_cast(ast.get())) + { + fuzzOrderByElement(order_by_element); + } else if (auto * fn = typeid_cast(ast.get())) { - fuzzColumnLikeExpressionList(fn->arguments); - fuzzColumnLikeExpressionList(fn->parameters); + fuzzColumnLikeExpressionList(fn->arguments.get()); + fuzzColumnLikeExpressionList(fn->parameters.get()); + + if (fn->is_window_function) + { + fuzzColumnLikeExpressionList(fn->window_partition_by); + fuzzOrderByList(fn->window_order_by); + } fuzz(fn->children); } else if (auto * select = typeid_cast(ast.get())) { - fuzzColumnLikeExpressionList(select->select()); - fuzzColumnLikeExpressionList(select->groupBy()); + fuzzColumnLikeExpressionList(select->select().get()); + fuzzColumnLikeExpressionList(select->groupBy().get()); + fuzzOrderByList(select->orderBy().get()); fuzz(select->children); } @@ -416,6 +540,10 @@ void QueryFuzzer::collectFuzzInfoRecurse(const ASTPtr ast) void QueryFuzzer::fuzzMain(ASTPtr & ast) { + current_ast_depth = 0; + debug_visited_nodes.clear(); + debug_top_ast = * + collectFuzzInfoMain(ast); fuzz(ast); diff --git a/programs/client/QueryFuzzer.h b/programs/client/QueryFuzzer.h index 0c7cec8dc84..e9d3f150283 100644 --- a/programs/client/QueryFuzzer.h +++ b/programs/client/QueryFuzzer.h @@ -12,6 +12,9 @@ namespace DB { +class ASTExpressionList; +class ASTOrderByElement; + /* * This is an AST-based query fuzzer that makes random modifications to query * AST, changing numbers, list of columns, functions, etc. It remembers part of @@ -23,6 +26,13 @@ struct QueryFuzzer { pcg64 fuzz_rand{randomSeed()}; + // We add elements to expression lists with fixed probability. Some elements + // are so large, that the expected number of elements we add to them is + // one or higher, hence this process might never finish. Put some limit on the + // total depth of AST to prevent this. + // This field is reset for each fuzzMain() call. + size_t current_ast_depth = 0; + // These arrays hold parts of queries that we can substitute into the query // we are currently fuzzing. We add some part from each new query we are asked // to fuzz, and keep this state between queries, so the fuzzing output becomes @@ -36,6 +46,12 @@ struct QueryFuzzer std::unordered_map table_like_map; std::vector table_like; + // Some debug fields for detecting problematic ASTs with loops. + // These are reset for each fuzzMain call. + std::unordered_set debug_visited_nodes; + ASTPtr * debug_top_ast; + + // This is the only function you have to call -- it will modify the passed // ASTPtr to point to new AST with some random changes. 
void fuzzMain(ASTPtr & ast); @@ -46,7 +62,9 @@ struct QueryFuzzer ASTPtr getRandomColumnLike(); void replaceWithColumnLike(ASTPtr & ast); void replaceWithTableLike(ASTPtr & ast); - void fuzzColumnLikeExpressionList(ASTPtr ast); + void fuzzOrderByElement(ASTOrderByElement * elem); + void fuzzOrderByList(IAST * ast); + void fuzzColumnLikeExpressionList(IAST * ast); void fuzz(ASTs & asts); void fuzz(ASTPtr & ast); void collectFuzzInfoMain(const ASTPtr ast); diff --git a/programs/client/TestHint.h b/programs/client/TestHint.h index 65666f4304c..f1998588261 100644 --- a/programs/client/TestHint.h +++ b/programs/client/TestHint.h @@ -19,6 +19,7 @@ namespace ErrorCodes /// Checks expected server and client error codes in testmode. /// To enable it add special comment after the query: "-- { serverError 60 }" or "-- { clientError 20 }". +/// Also you can enable echoing all queries by writing "-- { echo }". class TestHint { public: @@ -84,12 +85,14 @@ public: int serverError() const { return server_error; } int clientError() const { return client_error; } + bool echoQueries() const { return echo; } private: bool enabled = false; const String & query; int server_error = 0; int client_error = 0; + bool echo = false; void parse(const String & hint) { @@ -107,6 +110,8 @@ private: ss >> server_error; else if (item == "clientError") ss >> client_error; + else if (item == "echo") + echo = true; } } diff --git a/programs/install/Install.cpp b/programs/install/Install.cpp index 9e3942e126d..2fba335bc66 100644 --- a/programs/install/Install.cpp +++ b/programs/install/Install.cpp @@ -10,6 +10,10 @@ #include #endif +#if defined(OS_DARWIN) + #include +#endif + #include #include #include @@ -147,9 +151,24 @@ int mainEntryClickHouseInstall(int argc, char ** argv) try { /// We need to copy binary to the binary directory. - /// The binary is currently run. We need to obtain its path from procfs. + /// The binary is currently run. We need to obtain its path from procfs (on Linux). +#if defined(OS_DARWIN) + uint32_t path_length = 0; + _NSGetExecutablePath(nullptr, &path_length); + if (path_length <= 1) + Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary"); + + std::string path(path_length, std::string::value_type()); + auto res = _NSGetExecutablePath(&path[0], &path_length); + if (res != 0) + Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary"); + + fs::path binary_self_path(path); +#else fs::path binary_self_path = "/proc/self/exe"; +#endif + if (!fs::exists(binary_self_path)) throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary from {}, file doesn't exist", binary_self_path.string()); diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 9d7e78ac50d..f87b4294587 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -103,6 +104,7 @@ namespace CurrentMetrics int mainEntryClickHouseServer(int argc, char ** argv) { DB::Server app; + app.shouldSetupWatchdog(argc ? 
argv[0] : nullptr); try { return app.run(argc, argv); @@ -366,6 +368,7 @@ void checkForUsersNotInMainConfig( int Server::main(const std::vector & /*args*/) { Poco::Logger * log = &logger(); + UseSSL use_ssl; MainThreadStatus::getInstance(); diff --git a/src/AggregateFunctions/AggregateFunctionAvg.h b/src/AggregateFunctions/AggregateFunctionAvg.h index d07ff5db2f2..a46796b9c8d 100644 --- a/src/AggregateFunctions/AggregateFunctionAvg.h +++ b/src/AggregateFunctions/AggregateFunctionAvg.h @@ -127,10 +127,10 @@ public: void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override { if constexpr (IsDecimalNumber || IsDecimalNumber) - static_cast &>(to).getData().push_back( + assert_cast &>(to).getData().push_back( this->data(place).divideIfAnyDecimal(num_scale, denom_scale)); else - static_cast &>(to).getData().push_back(this->data(place).divide()); + assert_cast &>(to).getData().push_back(this->data(place).divide()); } private: UInt32 num_scale; diff --git a/src/AggregateFunctions/IAggregateFunction.h b/src/AggregateFunctions/IAggregateFunction.h index b591bd3acd7..d5e931ccc73 100644 --- a/src/AggregateFunctions/IAggregateFunction.h +++ b/src/AggregateFunctions/IAggregateFunction.h @@ -104,9 +104,12 @@ public: return false; } - /// Inserts results into a column. - /// This method must be called once, from single thread. - /// After this method was called for state, you can't do anything with state but destroy. + /// Inserts results into a column. This method might modify the state (e.g. + /// sort an array), so must be called once, from single thread. The state + /// must remain valid though, and the subsequent calls to add/merge/ + /// insertResultInto must work correctly. This kind of call sequence occurs + /// in `runningAccumulate`, or when calculating an aggregate function as a + /// window function. virtual void insertResultInto(AggregateDataPtr place, IColumn & to, Arena * arena) const = 0; /// Used for machine learning methods. Predict result from trained model. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7596d85571c..5f655f1f466 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -436,6 +436,8 @@ if (USE_ROCKSDB) dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${ROCKSDB_INCLUDE_DIR}) endif() +dbms_target_link_libraries(PRIVATE _boost_context) + if (ENABLE_TESTS AND USE_GTEST) macro (grep_gtest_sources BASE_DIR DST_VAR) # Cold match files that are not in tests/ directories diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 8f4a64766cd..ef114490c51 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -742,8 +742,11 @@ std::optional Connection::checkPacket(size_t timeout_microseconds) } -Packet Connection::receivePacket() +Packet Connection::receivePacket(std::function async_callback) { + in->setAsyncCallback(std::move(async_callback)); + SCOPE_EXIT(in->setAsyncCallback({})); + try { Packet res; diff --git a/src/Client/Connection.h b/src/Client/Connection.h index 265a9913b8e..83e8f3ba206 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -18,6 +18,7 @@ #include #include +#include #include @@ -171,7 +172,8 @@ public: std::optional checkPacket(size_t timeout_microseconds = 0); /// Receive packet from server. - Packet receivePacket(); + /// Each time read blocks and async_callback is set, it will be called. You can poll socket inside it. + Packet receivePacket(std::function async_callback = {}); /// If not connected yet, or if connection is broken - then connect. 
If cannot connect - throw an exception. void forceConnected(const ConnectionTimeouts & timeouts); @@ -226,7 +228,7 @@ private: String server_display_name; std::unique_ptr socket; - std::shared_ptr in; + std::shared_ptr in; std::shared_ptr out; std::optional last_input_packet_type; diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp index a99b0f9d7cc..ed7aad0a515 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -237,7 +237,7 @@ std::string MultiplexedConnections::dumpAddressesUnlocked() const return buf.str(); } -Packet MultiplexedConnections::receivePacketUnlocked() +Packet MultiplexedConnections::receivePacketUnlocked(std::function async_callback) { if (!sent_query) throw Exception("Cannot receive packets: no query sent.", ErrorCodes::LOGICAL_ERROR); @@ -249,7 +249,7 @@ Packet MultiplexedConnections::receivePacketUnlocked() if (current_connection == nullptr) throw Exception("Logical error: no available replica", ErrorCodes::NO_AVAILABLE_REPLICA); - Packet packet = current_connection->receivePacket(); + Packet packet = current_connection->receivePacket(std::move(async_callback)); switch (packet.type) { diff --git a/src/Client/MultiplexedConnections.h b/src/Client/MultiplexedConnections.h index eaec7f744bc..46312ae339d 100644 --- a/src/Client/MultiplexedConnections.h +++ b/src/Client/MultiplexedConnections.h @@ -69,7 +69,7 @@ public: private: /// Internal version of `receivePacket` function without locking. - Packet receivePacketUnlocked(); + Packet receivePacketUnlocked(std::function async_callback = {}); /// Internal version of `dumpAddresses` function without locking. std::string dumpAddressesUnlocked() const; @@ -105,6 +105,8 @@ private: /// A mutex for the sendCancel function to execute safely /// in separate thread. mutable std::mutex cancel_mutex; + + friend class RemoteQueryExecutorReadContext; }; } diff --git a/src/Common/Fiber.h b/src/Common/Fiber.h new file mode 100644 index 00000000000..e4efc0bdb6a --- /dev/null +++ b/src/Common/Fiber.h @@ -0,0 +1,5 @@ +#pragma once +#include +#include + +using Fiber = boost::context::fiber; diff --git a/src/Common/FiberStack.h b/src/Common/FiberStack.h new file mode 100644 index 00000000000..2917a64a692 --- /dev/null +++ b/src/Common/FiberStack.h @@ -0,0 +1,74 @@ +#pragma once +#include +#include +#include +#include + +#include +#include +#include + +#if defined(BOOST_USE_VALGRIND) +#include +#endif + +namespace DB::ErrorCodes +{ + extern const int CANNOT_ALLOCATE_MEMORY; +} + +/// This is an implementation of allocator for fiber stack. +/// The reference implementation is protected_fixedsize_stack from boost::context. +/// This implementation additionally track memory usage. It is the main reason why it is needed. 
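FiberStack below plugs into Boost.Context as a stack allocator: allocate() returns a boost::context::stack_context and deallocate() releases it. A minimal usage sketch, assuming the Boost >= 1.69 fiber API and the FiberStack class defined just below; the demo function itself is illustrative and not part of the patch:

#include <boost/context/fiber.hpp>
#include <iostream>
#include <memory>
#include <utility>

/// Run a lambda on a fiber whose stack comes from the custom allocator,
/// switching between the fiber and its caller a couple of times.
void fiberDemo()
{
    namespace ctx = boost::context;

    ctx::fiber f(std::allocator_arg, FiberStack(), [](ctx::fiber && sink)
    {
        std::cout << "running on the fiber stack\n";
        sink = std::move(sink).resume();    /// Switch back to the caller.
        std::cout << "resumed once more\n";
        return std::move(sink);
    });

    f = std::move(f).resume();              /// Enter the fiber.
    f = std::move(f).resume();              /// Let it run to completion.
}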
+class FiberStack +{ +private: + size_t stack_size; + size_t page_size = 0; +public: + static constexpr size_t default_stack_size = 128 * 1024; /// 64KB was not enough for tests + + explicit FiberStack(size_t stack_size_ = default_stack_size) : stack_size(stack_size_) + { + page_size = ::sysconf(_SC_PAGESIZE); + } + + boost::context::stack_context allocate() + { + size_t num_pages = 1 + (stack_size - 1) / page_size; + size_t num_bytes = (num_pages + 1) * page_size; /// Add one page at bottom that will be used as guard-page + + void * vp = ::mmap(nullptr, num_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (MAP_FAILED == vp) + DB::throwFromErrno(fmt::format("FiberStack: Cannot mmap {}.", ReadableSize(num_bytes)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY); + + if (-1 == ::mprotect(vp, page_size, PROT_NONE)) + { + ::munmap(vp, num_bytes); + DB::throwFromErrno("FiberStack: cannot protect guard page", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY); + } + + /// Do not count guard page in memory usage. + CurrentMemoryTracker::alloc(num_pages * page_size); + + boost::context::stack_context sctx; + sctx.size = num_bytes; + sctx.sp = static_cast< char * >(vp) + sctx.size; +#if defined(BOOST_USE_VALGRIND) + sctx.valgrind_stack_id = VALGRIND_STACK_REGISTER(sctx.sp, vp); +#endif + return sctx; + } + + void deallocate(boost::context::stack_context & sctx) + { +#if defined(BOOST_USE_VALGRIND) + VALGRIND_STACK_DEREGISTER(sctx.valgrind_stack_id); +#endif + void * vp = static_cast< char * >(sctx.sp) - sctx.size; + ::munmap(vp, sctx.size); + + /// Do not count guard page in memory usage. + CurrentMemoryTracker::free(sctx.size - page_size); + } +}; diff --git a/src/Common/ThreadFuzzer.cpp b/src/Common/ThreadFuzzer.cpp index a32e50c44b2..88ff53534e6 100644 --- a/src/Common/ThreadFuzzer.cpp +++ b/src/Common/ThreadFuzzer.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include diff --git a/src/Common/TimerDescriptor.cpp b/src/Common/TimerDescriptor.cpp new file mode 100644 index 00000000000..f4c3ec35588 --- /dev/null +++ b/src/Common/TimerDescriptor.cpp @@ -0,0 +1,84 @@ +#if defined(OS_LINUX) +#include +#include + +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_CREATE_TIMER; + extern const int CANNOT_SET_TIMER_PERIOD; + extern const int CANNOT_FCNTL; + extern const int CANNOT_READ_FROM_SOCKET; +} + +TimerDescriptor::TimerDescriptor(int clockid, int flags) +{ + timer_fd = timerfd_create(clockid, flags); + if (timer_fd == -1) + throw Exception(ErrorCodes::CANNOT_CREATE_TIMER, "Cannot create timer_fd descriptor"); + + if (-1 == fcntl(timer_fd, F_SETFL, O_NONBLOCK)) + throwFromErrno("Cannot set O_NONBLOCK for timer_fd", ErrorCodes::CANNOT_FCNTL); +} + +TimerDescriptor::~TimerDescriptor() +{ + /// Do not check for result cause cannot throw exception. + close(timer_fd); +} + +void TimerDescriptor::reset() const +{ + itimerspec spec; + spec.it_interval.tv_nsec = 0; + spec.it_interval.tv_sec = 0; + spec.it_value.tv_sec = 0; + spec.it_value.tv_nsec = 0; + + if (-1 == timerfd_settime(timer_fd, 0 /*relative timer */, &spec, nullptr)) + throwFromErrno("Cannot reset timer_fd", ErrorCodes::CANNOT_SET_TIMER_PERIOD); + + /// Drain socket. + /// It may be possible that alarm happened and socket is readable. + drain(); +} + +void TimerDescriptor::drain() const +{ + /// It is expected that socket returns 8 bytes when readable. + /// Read in loop anyway cause signal may interrupt read call. 
+ uint64_t buf; + while (true) + { + ssize_t res = ::read(timer_fd, &buf, sizeof(buf)); + if (res < 0) + { + if (errno == EAGAIN) + break; + + if (errno != EINTR) + throwFromErrno("Cannot drain timer_fd", ErrorCodes::CANNOT_READ_FROM_SOCKET); + } + } +} + +void TimerDescriptor::setRelative(const Poco::Timespan & timespan) const +{ + itimerspec spec; + spec.it_interval.tv_nsec = 0; + spec.it_interval.tv_sec = 0; + spec.it_value.tv_sec = timespan.totalSeconds(); + spec.it_value.tv_nsec = timespan.useconds(); + + if (-1 == timerfd_settime(timer_fd, 0 /*relative timer */, &spec, nullptr)) + throwFromErrno("Cannot set time for timer_fd", ErrorCodes::CANNOT_SET_TIMER_PERIOD); +} + +} +#endif diff --git a/src/Common/TimerDescriptor.h b/src/Common/TimerDescriptor.h new file mode 100644 index 00000000000..ddb8f2a1367 --- /dev/null +++ b/src/Common/TimerDescriptor.h @@ -0,0 +1,31 @@ +#pragma once +#if defined(OS_LINUX) +#include + +namespace DB +{ + +/// Wrapper over timerfd. +class TimerDescriptor +{ +private: + int timer_fd; + +public: + explicit TimerDescriptor(int clockid, int flags); + ~TimerDescriptor(); + + TimerDescriptor(const TimerDescriptor &) = delete; + TimerDescriptor & operator=(const TimerDescriptor &) = delete; + TimerDescriptor(TimerDescriptor &&) = default; + TimerDescriptor & operator=(TimerDescriptor &&) = default; + + int getDescriptor() const { return timer_fd; } + + void reset() const; + void drain() const; + void setRelative(const Poco::Timespan & timespan) const; +}; + +} +#endif diff --git a/src/Common/ZooKeeper/tests/CMakeLists.txt b/src/Common/ZooKeeper/tests/CMakeLists.txt index 7092720e7bc..bbfa3e1f137 100644 --- a/src/Common/ZooKeeper/tests/CMakeLists.txt +++ b/src/Common/ZooKeeper/tests/CMakeLists.txt @@ -4,9 +4,6 @@ target_link_libraries(zkutil_test_commands PRIVATE clickhouse_common_zookeeper) add_executable(zkutil_test_commands_new_lib zkutil_test_commands_new_lib.cpp) target_link_libraries(zkutil_test_commands_new_lib PRIVATE clickhouse_common_zookeeper string_utils) -add_executable(zkutil_expiration_test zkutil_expiration_test.cpp) -target_link_libraries(zkutil_expiration_test PRIVATE clickhouse_common_zookeeper) - add_executable(zkutil_test_async zkutil_test_async.cpp) target_link_libraries(zkutil_test_async PRIVATE clickhouse_common_zookeeper) diff --git a/src/Common/ZooKeeper/tests/nozk.sh b/src/Common/ZooKeeper/tests/nozk.sh deleted file mode 100755 index c4e409f735b..00000000000 --- a/src/Common/ZooKeeper/tests/nozk.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -# Добавляет в файрвол правила, не пропускающие пакеты до серверов ZooKeeper. -# Используется для тестирования поведения программ при потере соединения с ZooKeeper. -# yeszk.sh производит обратные изменения. 
- -# Чтобы посмотреть, какие правила сейчас есть, используйте sudo iptables -L и sudo ip6tables -L - -sudo iptables -A OUTPUT -p tcp --dport 2181 -j DROP -sudo ip6tables -A OUTPUT -p tcp --dport 2181 -j DROP - -# You could also test random drops: -#sudo iptables -A OUTPUT -p tcp --dport 2181 -j REJECT --reject-with tcp-reset -m statistic --mode random --probability 0.1 -#sudo ip6tables -A OUTPUT -p tcp --dport 2181 -j REJECT --reject-with tcp-reset -m statistic --mode random --probability 0.1 - diff --git a/src/Common/ZooKeeper/tests/yeszk.sh b/src/Common/ZooKeeper/tests/yeszk.sh deleted file mode 100755 index 4f186a90183..00000000000 --- a/src/Common/ZooKeeper/tests/yeszk.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -# Выполняет действия, обратные nozk.sh - -cat nozk.sh | sed 's/-A/-D/g' | bash - diff --git a/src/Common/ZooKeeper/tests/zkutil_expiration_test.cpp b/src/Common/ZooKeeper/tests/zkutil_expiration_test.cpp deleted file mode 100644 index e09c72a4d6c..00000000000 --- a/src/Common/ZooKeeper/tests/zkutil_expiration_test.cpp +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include -#include -#include -#include - - -/// Проверяет, какие ошибки выдает ZooKeeper при попытке сделать какую-нибудь операцию через разное время после истечения сессии. -/// Спойлер: multi иногда падает с segfault, а до этого фейлится с marshalling error. -/// create всегда фейлится с invalid zhandle state. - -int main(int argc, char ** argv) -{ - try - { - if (argc != 2) - { - std::cerr << "usage: " << argv[0] << " hosts" << std::endl; - return 2; - } - - Poco::AutoPtr channel = new Poco::ConsoleChannel(std::cerr); - Poco::Logger::root().setChannel(channel); - Poco::Logger::root().setLevel("trace"); - - zkutil::ZooKeeper zk(argv[1]); - std::string unused; - zk.tryCreate("/test", "", zkutil::CreateMode::Persistent, unused); - - std::cerr << "Please run `./nozk.sh && sleep 40s && ./yeszk.sh`" << std::endl; - - time_t time0 = time(nullptr); - - while (true) - { - { - Coordination::Requests ops; - ops.emplace_back(zkutil::makeCreateRequest("/test/zk_expiration_test", "hello", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeRemoveRequest("/test/zk_expiration_test", -1)); - - Coordination::Responses responses; - Coordination::Error code = zk.tryMultiNoThrow(ops, responses); - - std::cout << time(nullptr) - time0 << "s: " << Coordination::errorMessage(code) << std::endl; - try - { - if (code != Coordination::Error::ZOK) - std::cout << "Path: " << zkutil::KeeperMultiException(code, ops, responses).getPathForFirstFailedOp() << std::endl; - } - catch (...) - { - std::cout << DB::getCurrentExceptionMessage(false) << std::endl; - } - - } - - sleep(1); - } - } - catch (Coordination::Exception &) - { - std::cerr << "KeeperException: " << DB::getCurrentExceptionMessage(true) << std::endl; - return 1; - } - catch (...) 
- { - std::cerr << "Some exception: " << DB::getCurrentExceptionMessage(true) << std::endl; - return 2; - } -} diff --git a/src/Common/ya.make b/src/Common/ya.make index c8be43f98a2..07175029276 100644 --- a/src/Common/ya.make +++ b/src/Common/ya.make @@ -75,6 +75,7 @@ SRCS( ThreadPool.cpp ThreadProfileEvents.cpp ThreadStatus.cpp + TimerDescriptor.cpp TraceCollector.cpp UTF8Helpers.cpp UnicodeBar.cpp diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 1f13d08aa55..bad149fcfa8 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -406,9 +406,12 @@ class IColumn; M(Bool, optimize_skip_merged_partitions, false, "Skip partitions with one part with level > 0 in optimize final", 0) \ M(Bool, optimize_on_insert, true, "Do the same transformation for inserted block of data as if merge was done on this block.", 0) \ M(Bool, allow_experimental_map_type, false, "Allow data type Map", 0) \ + M(Bool, allow_experimental_window_functions, false, "Allow experimental window functions", 0) \ \ M(Bool, use_antlr_parser, false, "Parse incoming queries using ANTLR-generated parser", 0) \ \ + M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \ + \ /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \ \ M(UInt64, max_memory_usage_for_all_queries, 0, "Obsolete. Will be removed after 2020-10-20", 0) \ diff --git a/src/Core/SortDescription.cpp b/src/Core/SortDescription.cpp index 4a5952c3bc2..cb7378cf096 100644 --- a/src/Core/SortDescription.cpp +++ b/src/Core/SortDescription.cpp @@ -37,5 +37,12 @@ void dumpSortDescription(const SortDescription & description, const Block & head } } +std::string dumpSortDescription(const SortDescription & description) +{ + WriteBufferFromOwnString wb; + dumpSortDescription(description, Block{}, wb); + return wb.str(); +} + } diff --git a/src/Core/SortDescription.h b/src/Core/SortDescription.h index 79ac7ddf142..1450393ebd8 100644 --- a/src/Core/SortDescription.h +++ b/src/Core/SortDescription.h @@ -72,4 +72,6 @@ class Block; /// Outputs user-readable description into `out`. void dumpSortDescription(const SortDescription & description, const Block & header, WriteBuffer & out); +std::string dumpSortDescription(const SortDescription & description); + } diff --git a/src/DataStreams/RemoteQueryExecutor.cpp b/src/DataStreams/RemoteQueryExecutor.cpp index c38f42893af..51d229a1126 100644 --- a/src/DataStreams/RemoteQueryExecutor.cpp +++ b/src/DataStreams/RemoteQueryExecutor.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -11,6 +12,7 @@ #include #include #include +#include namespace DB { @@ -192,68 +194,119 @@ Block RemoteQueryExecutor::read() Packet packet = multiplexed_connections->receivePacket(); - switch (packet.type) - { - case Protocol::Server::Data: - /// If the block is not empty and is not a header block - if (packet.block && (packet.block.rows() > 0)) - return adaptBlockStructure(packet.block, header); - break; /// If the block is empty - we will receive other packets before EndOfStream. - - case Protocol::Server::Exception: - got_exception_from_replica = true; - packet.exception->rethrow(); - break; - - case Protocol::Server::EndOfStream: - if (!multiplexed_connections->hasActiveConnections()) - { - finished = true; - return Block(); - } - break; - - case Protocol::Server::Progress: - /** We use the progress from a remote server. 
- * We also include in ProcessList, - * and we use it to check - * constraints (for example, the minimum speed of query execution) - * and quotas (for example, the number of lines to read). - */ - if (progress_callback) - progress_callback(packet.progress); - break; - - case Protocol::Server::ProfileInfo: - /// Use own (client-side) info about read bytes, it is more correct info than server-side one. - if (profile_info_callback) - profile_info_callback(packet.profile_info); - break; - - case Protocol::Server::Totals: - totals = packet.block; - break; - - case Protocol::Server::Extremes: - extremes = packet.block; - break; - - case Protocol::Server::Log: - /// Pass logs from remote server to client - if (auto log_queue = CurrentThread::getInternalTextLogsQueue()) - log_queue->pushBlock(std::move(packet.block)); - break; - - default: - got_unknown_packet_from_replica = true; - throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}", - toString(packet.type), - multiplexed_connections->dumpAddresses()); - } + if (auto block = processPacket(std::move(packet))) + return *block; } } -void RemoteQueryExecutor::finish() +std::variant RemoteQueryExecutor::read(std::unique_ptr & read_context [[maybe_unused]]) +{ + +#if defined(OS_LINUX) + if (!sent_query) + { + sendQuery(); + + if (context.getSettingsRef().skip_unavailable_shards && (0 == multiplexed_connections->size())) + return Block(); + } + + if (!read_context) + { + std::lock_guard lock(was_cancelled_mutex); + if (was_cancelled) + return Block(); + + read_context = std::make_unique(*multiplexed_connections); + } + + do + { + if (!read_context->resumeRoutine()) + return Block(); + + if (read_context->is_read_in_progress) + { + read_context->setTimer(); + return read_context->epoll_fd; + } + else + { + if (auto data = processPacket(std::move(read_context->packet))) + return std::move(*data); + } + } + while (true); +#else + return read(); +#endif +} + +std::optional RemoteQueryExecutor::processPacket(Packet packet) +{ + switch (packet.type) + { + case Protocol::Server::Data: + /// If the block is not empty and is not a header block + if (packet.block && (packet.block.rows() > 0)) + return adaptBlockStructure(packet.block, header); + break; /// If the block is empty - we will receive other packets before EndOfStream. + + case Protocol::Server::Exception: + got_exception_from_replica = true; + packet.exception->rethrow(); + break; + + case Protocol::Server::EndOfStream: + if (!multiplexed_connections->hasActiveConnections()) + { + finished = true; + return Block(); + } + break; + + case Protocol::Server::Progress: + /** We use the progress from a remote server. + * We also include in ProcessList, + * and we use it to check + * constraints (for example, the minimum speed of query execution) + * and quotas (for example, the number of lines to read). + */ + if (progress_callback) + progress_callback(packet.progress); + break; + + case Protocol::Server::ProfileInfo: + /// Use own (client-side) info about read bytes, it is more correct info than server-side one. 
+ if (profile_info_callback) + profile_info_callback(packet.profile_info); + break; + + case Protocol::Server::Totals: + totals = packet.block; + break; + + case Protocol::Server::Extremes: + extremes = packet.block; + break; + + case Protocol::Server::Log: + /// Pass logs from remote server to client + if (auto log_queue = CurrentThread::getInternalTextLogsQueue()) + log_queue->pushBlock(std::move(packet.block)); + break; + + default: + got_unknown_packet_from_replica = true; + throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}", + toString(packet.type), + multiplexed_connections->dumpAddresses()); + } + + return {}; +} + +void RemoteQueryExecutor::finish(std::unique_ptr * read_context) { /** If one of: * - nothing started to do; @@ -270,7 +323,7 @@ void RemoteQueryExecutor::finish() */ /// Send the request to abort the execution of the request, if not already sent. - tryCancel("Cancelling query because enough data has been read"); + tryCancel("Cancelling query because enough data has been read", read_context); /// Get the remaining packets so that there is no out of sync in the connections to the replicas. Packet packet = multiplexed_connections->drain(); @@ -299,7 +352,7 @@ void RemoteQueryExecutor::finish() } } -void RemoteQueryExecutor::cancel() +void RemoteQueryExecutor::cancel(std::unique_ptr * read_context) { { std::lock_guard lock(external_tables_mutex); @@ -313,7 +366,7 @@ void RemoteQueryExecutor::cancel() if (!isQueryPending() || hasThrownException()) return; - tryCancel("Cancelling query"); + tryCancel("Cancelling query", read_context); } void RemoteQueryExecutor::sendScalars() @@ -365,7 +418,7 @@ void RemoteQueryExecutor::sendExternalTables() multiplexed_connections->sendExternalTablesData(external_tables_data); } -void RemoteQueryExecutor::tryCancel(const char * reason) +void RemoteQueryExecutor::tryCancel(const char * reason, std::unique_ptr * read_context) { { /// Flag was_cancelled is atomic because it is checked in read(). @@ -375,6 +428,10 @@ void RemoteQueryExecutor::tryCancel(const char * reason) return; was_cancelled = true; + + if (read_context && *read_context) + (*read_context)->cancel(); + multiplexed_connections->sendCancel(); } diff --git a/src/DataStreams/RemoteQueryExecutor.h b/src/DataStreams/RemoteQueryExecutor.h index cec92a5f0e9..46d9d067563 100644 --- a/src/DataStreams/RemoteQueryExecutor.h +++ b/src/DataStreams/RemoteQueryExecutor.h @@ -5,6 +5,9 @@ #include #include #include +#include +#include +#include namespace DB { @@ -20,10 +23,14 @@ using ProgressCallback = std::function; struct BlockStreamProfileInfo; using ProfileInfoCallback = std::function; +class RemoteQueryExecutorReadContext; + /// This class allows one to launch queries on remote replicas of one shard and get results class RemoteQueryExecutor { public: + using ReadContext = RemoteQueryExecutorReadContext; + /// Takes already set connection. RemoteQueryExecutor( Connection & connection, @@ -53,13 +60,17 @@ public: /// Read next block of data. Returns empty block if query is finished. Block read(); + /// Async variant of read. Returns ready block or file descriptor which may be used for polling. + /// ReadContext is an internal read state. Pass empty ptr first time, reuse created one for every call. + std::variant read(std::unique_ptr & read_context); + /// Receive all remain packets and finish query. /// It should be cancelled after read returned empty block. 
- void finish(); + void finish(std::unique_ptr * read_context = nullptr); /// Cancel query execution. Sends Cancel packet and ignore others. /// This method may be called from separate thread. - void cancel(); + void cancel(std::unique_ptr * read_context = nullptr); /// Get totals and extremes if any. Block getTotals() { return std::move(totals); } @@ -153,13 +164,16 @@ private: void sendExternalTables(); /// If wasn't sent yet, send request to cancel all connections to replicas - void tryCancel(const char * reason); + void tryCancel(const char * reason, std::unique_ptr * read_context); /// Returns true if query was sent bool isQueryPending() const; /// Returns true if exception was thrown bool hasThrownException() const; + + /// Process packet for read and return data block if possible. + std::optional processPacket(Packet packet); }; } diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.h b/src/DataStreams/RemoteQueryExecutorReadContext.h new file mode 100644 index 00000000000..f8c64954b83 --- /dev/null +++ b/src/DataStreams/RemoteQueryExecutorReadContext.h @@ -0,0 +1,272 @@ +#pragma once + +#if defined(OS_LINUX) + +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_READ_FROM_SOCKET; + extern const int CANNOT_OPEN_FILE; + extern const int SOCKET_TIMEOUT; +} + +class RemoteQueryExecutorReadContext +{ +public: + using Self = RemoteQueryExecutorReadContext; + + bool is_read_in_progress = false; + Packet packet; + + std::exception_ptr exception; + FiberStack stack; + boost::context::fiber fiber; + /// This mutex for fiber is needed because fiber could be destroyed in cancel method from another thread. + std::mutex fiber_lock; + + Poco::Timespan receive_timeout; + MultiplexedConnections & connections; + Poco::Net::Socket * last_used_socket = nullptr; + + /// Here we have three descriptors we are going to wait: + /// * socket_fd is a descriptor of connection. It may be changed in case of reading from several replicas. + /// * timer is a timerfd descriptor to manually check socket timeout + /// * pipe_fd is a pipe we use to cancel query and socket polling by executor. + /// We put those descriptors into our own epoll_fd which is used by external executor. 
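For orientation, the intended calling pattern for this asynchronous API is roughly the following caller-side sketch (not part of the patch). It assumes the variant returned by read() carries either a ready Block or the pollable epoll descriptor, as the comments above state; process_block() is a hypothetical placeholder for whatever the caller does with the data, and the surrounding includes of RemoteQueryExecutor.h etc. are omitted.

#include <poll.h>
#include <variant>

void drainRemoteQuery(DB::RemoteQueryExecutor & executor)
{
    std::unique_ptr<DB::RemoteQueryExecutor::ReadContext> read_context;
    while (true)
    {
        std::variant<DB::Block, int> result = executor.read(read_context);
        if (auto * block = std::get_if<DB::Block>(&result))
        {
            if (!*block)
                break;                      /// Empty block: the remote query has finished.
            process_block(*block);          /// Hypothetical consumer of the received block.
        }
        else
        {
            /// Data is not ready yet: wait until the returned epoll descriptor becomes
            /// readable, then call read() again with the same read_context.
            pollfd pfd{std::get<int>(result), POLLIN, 0};
            ::poll(&pfd, 1, /* timeout_ms = */ -1);
        }
    }
    executor.finish(&read_context);
}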
+ TimerDescriptor timer{CLOCK_MONOTONIC, 0}; + int socket_fd = -1; + int epoll_fd; + int pipe_fd[2]; + + explicit RemoteQueryExecutorReadContext(MultiplexedConnections & connections_) : connections(connections_) + { + epoll_fd = epoll_create(2); + if (-1 == epoll_fd) + throwFromErrno("Cannot create epoll descriptor", ErrorCodes::CANNOT_OPEN_FILE); + + if (-1 == pipe2(pipe_fd, O_NONBLOCK)) + throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_OPEN_FILE); + + { + epoll_event socket_event; + socket_event.events = EPOLLIN | EPOLLPRI; + socket_event.data.fd = pipe_fd[0]; + + if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &socket_event)) + throwFromErrno("Cannot add pipe descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); + } + + { + epoll_event timer_event; + timer_event.events = EPOLLIN | EPOLLPRI; + timer_event.data.fd = timer.getDescriptor(); + + if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer_event.data.fd, &timer_event)) + throwFromErrno("Cannot add timer descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); + } + + auto routine = Routine{connections, *this}; + fiber = boost::context::fiber(std::allocator_arg_t(), stack, std::move(routine)); + } + + void setSocket(Poco::Net::Socket & socket) + { + int fd = socket.impl()->sockfd(); + if (fd == socket_fd) + return; + + epoll_event socket_event; + socket_event.events = EPOLLIN | EPOLLPRI; + socket_event.data.fd = fd; + + if (socket_fd != -1) + { + if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_DEL, socket_fd, &socket_event)) + throwFromErrno("Cannot remove socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); + } + + socket_fd = fd; + + if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, socket_fd, &socket_event)) + throwFromErrno("Cannot add socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); + + receive_timeout = socket.impl()->getReceiveTimeout(); + } + + bool checkTimeout() const + { + try + { + return checkTimeoutImpl(); + } + catch (DB::Exception & e) + { + if (last_used_socket) + e.addMessage(" while reading from socket ({})", last_used_socket->peerAddress().toString()); + throw; + } + } + + bool checkTimeoutImpl() const + { + epoll_event events[3]; + events[0].data.fd = events[1].data.fd = events[2].data.fd = -1; + + /// Wait for epoll_fd will not block if it was polled externally. + int num_events = epoll_wait(epoll_fd, events, 3, 0); + if (num_events == -1) + throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); + + bool is_socket_ready = false; + bool is_pipe_alarmed = false; + bool has_timer_alarm = false; + + for (int i = 0; i < num_events; ++i) + { + if (events[i].data.fd == socket_fd) + is_socket_ready = true; + if (events[i].data.fd == timer.getDescriptor()) + has_timer_alarm = true; + if (events[i].data.fd == pipe_fd[0]) + is_pipe_alarmed = true; + } + + if (is_pipe_alarmed) + return false; + + if (has_timer_alarm && !is_socket_ready) + { + /// Socket receive timeout. Drain it in case or error, or it may be hide by timeout exception. + timer.drain(); + throw NetException("Timeout exceeded", ErrorCodes::SOCKET_TIMEOUT); + } + + return true; + } + + void setTimer() const + { + /// Did not get packet yet. Init timeout for the next async reading. 
+ timer.reset(); + + if (receive_timeout.totalMicroseconds()) + timer.setRelative(receive_timeout); + } + + bool resumeRoutine() + { + if (is_read_in_progress && !checkTimeout()) + return false; + + { + std::lock_guard guard(fiber_lock); + if (!fiber) + return false; + + fiber = std::move(fiber).resume(); + } + + if (exception) + std::rethrow_exception(std::move(exception)); + + return true; + } + + void cancel() + { + std::lock_guard guard(fiber_lock); + /// It is safe to just destroy fiber - we are not in the process of reading from socket. + boost::context::fiber to_destroy = std::move(fiber); + + /// Send something to pipe to cancel executor waiting. + uint64_t buf = 0; + while (-1 == write(pipe_fd[1], &buf, sizeof(buf))) + { + if (errno == EAGAIN) + break; + + if (errno != EINTR) + throwFromErrno("Cannot write to pipe", ErrorCodes::CANNOT_READ_FROM_SOCKET); + } + } + + ~RemoteQueryExecutorReadContext() + { + /// socket_fd is closed by Poco::Net::Socket + /// timer_fd is closed by TimerDescriptor + close(epoll_fd); + } + + struct Routine + { + MultiplexedConnections & connections; + Self & read_context; + + struct ReadCallback + { + Self & read_context; + Fiber & fiber; + + void operator()(Poco::Net::Socket & socket) + { + try + { + read_context.setSocket(socket); + } + catch (DB::Exception & e) + { + e.addMessage(" while reading from socket ({})", socket.peerAddress().toString()); + throw; + } + + read_context.is_read_in_progress = true; + fiber = std::move(fiber).resume(); + read_context.is_read_in_progress = false; + } + }; + + Fiber operator()(Fiber && sink) const + { + try + { + while (true) + { + read_context.packet = connections.receivePacketUnlocked(ReadCallback{read_context, sink}); + sink = std::move(sink).resume(); + } + } + catch (const boost::context::detail::forced_unwind &) + { + /// This exception is thrown by fiber implementation in case if fiber is being deleted but hasn't exited + /// It should not be caught or it will segfault. + /// Other exceptions must be caught + throw; + } + catch (...) + { + read_context.exception = std::current_exception(); + } + + return std::move(sink); + } + }; +}; +} +#else +namespace DB +{ +class RemoteQueryExecutorReadContext +{ +public: + void cancel() {} +}; + +} +#endif diff --git a/src/Functions/ExtractString.h b/src/Functions/ExtractString.h new file mode 100644 index 00000000000..c4251f8c4a6 --- /dev/null +++ b/src/Functions/ExtractString.h @@ -0,0 +1,166 @@ +#pragma once +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef __SSE4_2__ +# include +#endif + +namespace DB +{ +// used by FunctionsStringSimilarity and FunctionsStringHash +// includes extracting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word +template +struct ExtractStringImpl +{ + /// Padding form ColumnsString. It is a number of bytes we can always read starting from pos if pos < end. + static constexpr size_t default_padding = 16; + + /// Functions are read `default_padding - (N - 1)` bytes into the buffer. Window of size N is used. + /// Read copies `N - 1` last bytes from buffer into beginning, and then reads new bytes. + static constexpr size_t buffer_size = default_padding + N - 1; + + // the length of code_points = buffer_size + // pos: the current beginning location that we want to copy data + // end: the end location of the string + static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end) + { + /// Offset before which we copy some data. 
constexpr size_t padding_offset = default_padding - N + 1; + /// We have an array like this for ASCII (N == 4, other cases are similar) + /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| + /// And we copy ^^^^^^^^^^^^^^^ these bytes to the start + /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction + memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8)); + /// Now we have an array + /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| + /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + /// Doing unaligned read of 16 bytes and copy them like above + /// 16 is also chosen to do two `movups`. + /// Such copying allows us to have 3 codepoints from the previous read to produce the 4-grams with them. + memcpy(code_points + (N - 1), pos, default_padding * sizeof(UInt8)); + + if constexpr (CaseInsensitive) + { + /// We really need template lambdas with C++20 to do it inline + unrollLowering(code_points, std::make_index_sequence()); + } + pos += padding_offset; + if (pos > end) + return default_padding - (pos - end); + return default_padding; + } + + // read an ASCII word + static ALWAYS_INLINE inline size_t readOneASCIIWord(PaddedPODArray & word_buf, const char *& pos, const char * end) + { + // skip separators + while (pos < end && !isAlphaNumericASCII(*pos)) + ++pos; + + // the word starts here + const char * word_start = pos; + while (pos < end && isAlphaNumericASCII(*pos)) + ++pos; + + word_buf.assign(word_start, pos); + if (CaseInsensitive) + { + std::transform(word_buf.begin(), word_buf.end(), word_buf.begin(), [](UInt8 c) { return std::tolower(c); }); + } + return word_buf.size(); + } + + static ALWAYS_INLINE inline size_t readUTF8CodePoints(UInt32 * code_points, const char *& pos, const char * end) + { + memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt32)); + + size_t num = N - 1; + while (num < default_padding && pos < end) + { + code_points[num++] = readOneUTF8Code(pos, end); + } + return num; + } + + // read one UTF8 word starting at pos + static ALWAYS_INLINE inline size_t readOneUTF8Word(PaddedPODArray & word_buf, const char *& pos, const char * end) + { + // skip UTF8 separators + while (pos < end && isUTF8Sep(*pos)) + ++pos; + word_buf.clear(); + // collect the code points of the UTF8 word + while (pos < end && !isUTF8Sep(*pos)) + { + word_buf.push_back(readOneUTF8Code(pos, end)); + } + return word_buf.size(); + } + +private: + template + static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence &) + { + ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); + } + + // we use any ASCII non-alphanumeric character as a UTF8 separator + static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); } + + // read one UTF8 character and return it + static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end) + { + size_t length = UTF8::seqLength(*pos); + + if (pos + length > end) + length = end - pos; + UInt32 res; + switch (length) + { + case 1: + res = 0; + memcpy(&res, pos, 1); + break; + case 2: + res = 0; + memcpy(&res, pos, 2); + break; + case 3: + res = 0; + memcpy(&res, pos, 3); + break; + default: + memcpy(&res, pos, 4); + } + + if constexpr (CaseInsensitive) + { + switch (length) + { + case 4: + res &= ~(1u << (5 + 3 * CHAR_BIT)); + [[fallthrough]]; + case 3: + res &= ~(1u << (5 + 2 *
CHAR_BIT)); + [[fallthrough]]; + case 2: + res &= ~(1u); + res &= ~(1u << (5 + CHAR_BIT)); + [[fallthrough]]; + default: + res &= ~(1u << 5); + } + } + pos += length; + return res; + } +}; +} diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp new file mode 100644 index 00000000000..d57be67ef7f --- /dev/null +++ b/src/Functions/FunctionsStringHash.cpp @@ -0,0 +1,626 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +struct Hash +{ + static UInt64 crc32u64(UInt64 crc [[maybe_unused]], UInt64 val [[maybe_unused]]) + { +#ifdef __SSE4_2__ + return _mm_crc32_u64(crc, val); +#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + return __crc32cd(crc, val); +#else + throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); +#endif + } + + static UInt64 crc32u32(UInt64 crc [[maybe_unused]], UInt32 val [[maybe_unused]]) + { +#ifdef __SSE4_2__ + return _mm_crc32_u32(crc, val); +#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + return __crc32cw(crc, val); +#else + throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); +#endif + } + + static UInt64 crc32u8(UInt64 crc [[maybe_unused]], UInt8 val [[maybe_unused]]) + { +#ifdef __SSE4_2__ + return _mm_crc32_u8(crc, val); +#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + return __crc32cb(crc, val); +#else + throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); +#endif + } + + static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points) + { + return crc32u64(-1ULL, unalignedLoad(code_points)); + } + + static ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points) + { + UInt64 crc = -1ULL; + crc = crc32u64(crc, code_points[0]); + crc = crc32u64(crc, code_points[1]); + crc = crc32u64(crc, code_points[2]); + return crc; + } + + static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset) + { + UInt64 crc1 = -1ULL; + UInt64 crc2 = -1ULL; + + for (size_t i = offset; i < size; i += 2) + crc1 = crc32u64(crc1, hashes[i]); + for (size_t i = offset + 1; i < size; i += 2) + crc2 = crc32u64(crc2, hashes[i]); + + if ((size - offset) & 1) + { + for (size_t i = 0; i < offset; i += 2) + crc2 = crc32u64(crc2, hashes[i]); + for (size_t i = 1; i < offset; i += 2) + crc1 = crc32u64(crc1, hashes[i]); + } + else + { + for (size_t i = 0; i < offset; i += 2) + crc1 = crc32u64(crc1, hashes[i]); + for (size_t i = 1; i < offset; i += 2) + crc2 = crc32u64(crc2, hashes[i]); + } + + return crc1 | (crc2 << 32u); + } + + static ALWAYS_INLINE inline UInt64 hashSum(const UInt8 * hashes [[maybe_unused]], size_t K [[maybe_unused]]) + { + UInt64 crc1 = -1ULL; + UInt64 crc2 = -1ULL; + + for (size_t i = 0; i < K; i += 2) + crc1 = crc32u8(crc1, hashes[i]); + for (size_t i = 1; i < K; i += 2) + crc2 = crc32u8(crc2, hashes[i]); + + return crc1 | (crc2 << 32u); + } + + static ALWAYS_INLINE inline UInt64 hashSum(const UInt32 * hashes [[maybe_unused]], size_t K [[maybe_unused]]) + { + UInt64 crc1 = -1ULL; + UInt64 crc2 = -1ULL; + + for (size_t i = 0; i < K; i += 2) + crc1 = crc32u32(crc1, hashes[i]); + for (size_t i = 1; i < K; i += 2) + crc2 = crc32u32(crc2, hashes[i]); + + return crc1 | (crc2 << 32u); + } + + static ALWAYS_INLINE inline 
UInt64 hashSum(const UInt64 * hashes, size_t K) + { + UInt64 crc1 = -1ULL; + UInt64 crc2 = -1ULL; + + for (size_t i = 0; i < K; i += 2) + crc1 = crc32u64(crc1, hashes[i]); + for (size_t i = 1; i < K; i += 2) + crc2 = crc32u64(crc2, hashes[i]); + + return crc1 | (crc2 << 32u); + } +}; + +// Simhash String -> UInt64 +// N: the length of ngram or words shingles +// CodePoint: UInt8(ASCII) or UInt32(UTF8) +// UTF8: means ASCII or UTF8, these two parameters CodePoint and UTF8 can only be (UInt8, false) or (UInt32, true) +// Ngram: means ngram(true) or words shingles(false) +// CaseInsensitive: means should we consider about letter case or not +template +struct SimhashImpl +{ + using StrOp = ExtractStringImpl; + // we made an assumption that the size of one word cann't exceed 128, which may not true + // if some word's size exceed 128, it would be cut up to several word + static constexpr size_t max_string_size = 1u << 15; + static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size; + + // Simhash ngram calculate function: String ->UInt64 + // this function extracting ngram from input string, and maintain a 64-dimensions vector + // for each ngram, calculate a 64 bit hash value, and update the vector according the hash value + // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0 + static ALWAYS_INLINE inline UInt64 ngramCalculateHashValue( + const char * data, + size_t size, + size_t (*read_code_points)(CodePoint *, const char *&, const char *), + UInt64 (*hash_functor)(const CodePoint *)) + { + const char * start = data; + const char * end = data + size; + // fingerprint vector, all dimensions initialized to zero at the first + Int64 finger_vec[64] = {}; + CodePoint cp[simultaneously_codepoints_num] = {}; + + size_t found = read_code_points(cp, start, end); + size_t iter = N - 1; + + do + { + for (; iter + N <= found; ++iter) + { + // for each ngram, we can calculate an 64 bit hash + // then update finger_vec according to this hash value + // if the i'th bit is 1, finger_vec[i] plus 1, otherwise minus 1 + UInt64 hash_value = hash_functor(cp + iter); + std::bitset<64> bits(hash_value); + for (size_t i = 0; i < 64; ++i) + { + finger_vec[i] += ((bits.test(i)) ? 1 : -1); + } + } + iter = 0; + } while (start < end && (found = read_code_points(cp, start, end))); + + // finally, we return a 64 bit value according to finger_vec + // if finger_vec[i] > 0, the i'th bit of the value is 1, otherwise 0 + std::bitset<64> res_bit(0u); + for (size_t i = 0; i < 64; ++i) + { + if (finger_vec[i] > 0) + res_bit.set(i); + } + return res_bit.to_ullong(); + } + + // Simhash word shingle calculate funtion: String -> UInt64 + // this function extracting n word shingle from input string, and maintain a 64-dimensions vector as well + // for each word shingle, calculate a 64 bit hash value, and update the vector according the hash value + // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0 + // + // word shingle hash value calculate: + // 1. at the first, extracts N word shingles and calculate N hash values, store into an array, use this N hash values + // to calculate the first word shingle hash value + // 2. 
next, we extract one word at a time, calculate a new hash value for the new word, then use the latest N hash + // values to calculate the next word shingle hash value + static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue( + const char * data, + size_t size, + size_t (*read_one_word)(PaddedPODArray &, const char *&, const char *), + UInt64 (*hash_functor)(const UInt64 *, size_t, size_t)) + { + const char * start = data; + const char * end = data + size; + + // Also, a 64 bit vector initialized to zero + Int64 finger_vec[64] = {}; + // an array to store N word hash values + UInt64 nword_hashes[N] = {}; + // word buffer to store one word + PaddedPODArray word_buf; + // get the first word shingle + for (size_t i = 0; i < N && start < end; ++i) + { + read_one_word(word_buf, start, end); + if (!word_buf.empty()) + { + // for each word, calculate a hash value and store it into the array + nword_hashes[i++] = Hash::hashSum(word_buf.data(), word_buf.size()); + } + } + + // calculate the first word shingle hash value + UInt64 hash_value = hash_functor(nword_hashes, N, 0); + std::bitset<64> first_bits(hash_value); + for (size_t i = 0; i < 64; ++i) + { + finger_vec[i] += ((first_bits.test(i)) ? 1 : -1); + } + + size_t offset = 0; + while (start < end && read_one_word(word_buf, start, end)) + { + // we need to store the new word hash value in the oldest location. + // for example, N = 5, array |a0|a1|a2|a3|a4|, now a0 is the oldest location, + // so we store the new word hash into the location of a0, and the array becomes + // |a5|a1|a2|a3|a4|; next time, a1 becomes the oldest location, we store the new + // word hash value into the location of a1, and the array becomes |a5|a6|a2|a3|a4| + nword_hashes[offset] = Hash::hashSum(word_buf.data(), word_buf.size()); + offset = (offset + 1) % N; + // because of the way word hashes are stored, in order not to lose the word shingle's + // sequence information, when calculating the word shingle hash value we need to provide the offset + // information, which is the offset of the first word's hash value in the word shingle + hash_value = hash_functor(nword_hashes, N, offset); + std::bitset<64> bits(hash_value); + for (size_t i = 0; i < 64; ++i) + { + finger_vec[i] += ((bits.test(i)) ?
1 : -1); + } + } + + std::bitset<64> res_bit(0u); + for (size_t i = 0; i < 64; ++i) + { + if (finger_vec[i] > 0) + res_bit.set(i); + } + return res_bit.to_ullong(); + } + + static void apply(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res) + { + for (size_t i = 0; i < offsets.size(); ++i) + { + const char * one_data = reinterpret_cast(&data[offsets[i - 1]]); + const size_t data_size = offsets[i] - offsets[i - 1] - 1; + if (data_size <= max_string_size) + { + if constexpr (Ngram) + { + if constexpr (!UTF8) + res[i] = ngramCalculateHashValue(one_data, data_size, StrOp::readASCIICodePoints, Hash::ngramASCIIHash); + else + res[i] = ngramCalculateHashValue(one_data, data_size, StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); + } + else + { + if constexpr (!UTF8) + res[i] = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneASCIIWord, Hash::wordShinglesHash); + else + res[i] = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneUTF8Word, Hash::wordShinglesHash); + } + } + else + res[i] = -1ull; + } + } +}; + +template +class FixedHeap +{ +public: + FixedHeap() = delete; + + explicit FixedHeap(F f_) : f(f_), data_t(std::make_shared>(K, v)) + { + std::make_heap(data_t->begin(), data_t->end(), f); + } + + void insertAndReplace(UInt64 new_v) + { + data_t->push_back(new_v); + std::push_heap(data_t->begin(), data_t->end(), f); + std::pop_heap(data_t->begin(), data_t->end(), f); + data_t->pop_back(); + } + + const UInt64 * data() { return data_t->data(); } + +private: + F f; + std::shared_ptr> data_t; +}; + + +// Minhash: String -> Tuple(UInt64, UInt64) +// for each string, we extract ngram or word shingle, +// for each ngram or word shingle, calculate a hash value, +// then we take the K minimum hash values to calculate a hashsum, +// and take the K maximum hash values to calculate another hashsum, +// return this two hashsum: Tuple(hashsum1, hashsum2) +// +// N: the length of ngram or words shingles +// K: the number of minimum hashes and maximum hashes that we keep +// CodePoint: UInt8(ASCII) or UInt32(UTF8) +// UTF8: means ASCII or UTF8, these two parameters CodePoint and UTF8 can only be (UInt8, false) or (UInt32, true) +// Ngram: means ngram(true) or words shingles(false) +// CaseInsensitive: means should we consider about letter case or not +template +struct MinhashImpl +{ + using Less = std::less; + using Greater = std::greater; + using MaxHeap = FixedHeap, K, -1ULL>; + using MinHeap = FixedHeap, K, 0>; + using StrOp = ExtractStringImpl; + static constexpr size_t max_string_size = 1u << 15; + static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size; + + // Minhash ngram calculate function, String -> Tuple(UInt64, UInt64) + // we extract ngram from input string, and calculate a hash value for each ngram + // then we take the K minimum hash values to calculate a hashsum, + // and take the K maximum hash values to calculate another hashsum, + // return this two hashsum: Tuple(hashsum1, hashsum2) + static ALWAYS_INLINE inline std::tuple ngramCalculateHashValue( + const char * data, + size_t size, + size_t (*read_code_points)(CodePoint *, const char *&, const char *), + UInt64 (*hash_functor)(const CodePoint *)) + { + const char * start = data; + const char * end = data + size; + // we just maintain the K minimu and K maximum hash values + MaxHeap k_minimum_hashes(Less{}); + MinHeap k_maximum_hashes(Greater{}); + CodePoint cp[simultaneously_codepoints_num] = {}; + + size_t found = read_code_points(cp, start, 
end); + size_t iter = N - 1; + + do + { + for (; iter + N <= found; ++iter) + { + auto new_hash = hash_functor(cp + iter); + // insert the new hash value into array used to store K minimum value + // and K maximum value + k_minimum_hashes.insertAndReplace(new_hash); + k_maximum_hashes.insertAndReplace(new_hash); + } + iter = 0; + } while (start < end && (found = read_code_points(cp, start, end))); + + // calculate hashsum of the K minimum hash values and K maximum hash values + UInt64 res1 = Hash::hashSum(k_minimum_hashes.data(), K); + UInt64 res2 = Hash::hashSum(k_maximum_hashes.data(), K); + return std::make_tuple(res1, res2); + } + + // Minhash word shingle hash value calculate function: String ->Tuple(UInt64, UInt64) + // for each word shingle, we calculate a hash value, but in fact, we just maintain the + // K minimum and K maximum hash value + static ALWAYS_INLINE inline std::tuple wordShinglesCalculateHashValue( + const char * data, + size_t size, + size_t (*read_one_word)(PaddedPODArray &, const char *&, const char *), + UInt64 (*hash_functor)(const UInt64 *, size_t, size_t)) + { + const char * start = data; + const char * end = start + size; + // also we just store the K minimu and K maximum hash values + MaxHeap k_minimum_hashes(Less{}); + MinHeap k_maximum_hashes(Greater{}); + // array to store n word hashes + UInt64 nword_hashes[N] = {}; + // word buffer to store one word + PaddedPODArray word_buf; + // how word shingle hash value calculation and word hash storation is same as we + // have descripted in Simhash wordShinglesCalculateHashValue function + for (size_t i = 0; i < N && start < end; ++i) + { + read_one_word(word_buf, start, end); + if (!word_buf.empty()) + { + nword_hashes[i++] = Hash::hashSum(word_buf.data(), word_buf.size()); + } + } + + auto new_hash = hash_functor(nword_hashes, N, 0); + k_minimum_hashes.insertAndReplace(new_hash); + k_maximum_hashes.insertAndReplace(new_hash); + + size_t offset = 0; + while (start < end && read_one_word(word_buf, start, end)) + { + nword_hashes[offset] = Hash::hashSum(word_buf.data(), word_buf.size()); + offset = (offset + 1) % N; + new_hash = hash_functor(nword_hashes, N, offset); + k_minimum_hashes.insertAndReplace(new_hash); + k_maximum_hashes.insertAndReplace(new_hash); + } + + // calculate hashsum + UInt64 res1 = Hash::hashSum(k_minimum_hashes.data(), K); + UInt64 res2 = Hash::hashSum(k_maximum_hashes.data(), K); + return std::make_tuple(res1, res2); + } + + static void apply( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + PaddedPODArray & res1, + PaddedPODArray & res2) + { + for (size_t i = 0; i < offsets.size(); ++i) + { + const char * one_data = reinterpret_cast(&data[offsets[i - 1]]); + const size_t data_size = offsets[i] - offsets[i - 1] - 1; + if (data_size <= max_string_size) + { + if constexpr (Ngram) + { + if constexpr (!UTF8) + std::tie(res1[i], res2[i]) = ngramCalculateHashValue(one_data, data_size, StrOp::readASCIICodePoints, Hash::ngramASCIIHash); + else + std::tie(res1[i], res2[i]) = ngramCalculateHashValue(one_data, data_size, StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); + } + else + { + if constexpr (!UTF8) + std::tie(res1[i], res2[i]) = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneASCIIWord, Hash::wordShinglesHash); + else + std::tie(res1[i], res2[i]) = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneUTF8Word, Hash::wordShinglesHash); + } + } + else + std::tie(res1[i], res2[i]) = std::make_tuple(-1ull, -1ull); + } + } +}; + +struct 
NameNgramSimhash +{ + static constexpr auto name = "ngramSimhash"; +}; + +struct NameNgramSimhashCaseInsensitive +{ + static constexpr auto name = "ngramSimhashCaseInsensitive"; +}; + +struct NameNgramSimhashUTF8 +{ + static constexpr auto name = "ngramSimhashUTF8"; +}; + +struct NameNgramSimhashCaseInsensitiveUTF8 +{ + static constexpr auto name = "ngramSimhashCaseInsensitiveUTF8"; +}; + +struct NameWordShingleSimhash +{ + static constexpr auto name = "wordShingleSimhash"; +}; + +struct NameWordShingleSimhashCaseInsensitive +{ + static constexpr auto name = "wordShingleSimhashCaseInsensitive"; +}; + +struct NameWordShingleSimhashUTF8 +{ + static constexpr auto name = "wordShingleSimhashUTF8"; +}; + +struct NameWordShingleSimhashCaseInsensitiveUTF8 +{ + static constexpr auto name = "wordShingleSimhashCaseInsensitiveUTF8"; +}; + +struct NameNgramMinhash +{ + static constexpr auto name = "ngramMinhash"; +}; + +struct NameNgramMinhashCaseInsensitive +{ + static constexpr auto name = "ngramMinhashCaseInsensitive"; +}; + +struct NameNgramMinhashUTF8 +{ + static constexpr auto name = "ngramMinhashUTF8"; +}; + +struct NameNgramMinhashCaseInsensitiveUTF8 +{ + static constexpr auto name = "ngramMinhashCaseInsensitiveUTF8"; +}; + +struct NameWordShingleMinhash +{ + static constexpr auto name = "wordShingleMinhash"; +}; + +struct NameWordShingleMinhashCaseInsensitive +{ + static constexpr auto name = "wordShingleMinhashCaseInsensitive"; +}; + +struct NameWordShingleMinhashUTF8 +{ + static constexpr auto name = "wordShingleMinhashUTF8"; +}; + +struct NameWordShingleMinhashCaseInsensitiveUTF8 +{ + static constexpr auto name = "wordShingleMinhashCaseInsensitiveUTF8"; +}; + +// Simhash +using FunctionNgramSimhash = FunctionsStringHash, NameNgramSimhash, true>; + +using FunctionNgramSimhashCaseInsensitive + = FunctionsStringHash, NameNgramSimhashCaseInsensitive, true>; + +using FunctionNgramSimhashUTF8 = FunctionsStringHash, NameNgramSimhashUTF8, true>; + +using FunctionNgramSimhashCaseInsensitiveUTF8 + = FunctionsStringHash, NameNgramSimhashCaseInsensitiveUTF8, true>; + +using FunctionWordShingleSimhash = FunctionsStringHash, NameWordShingleSimhash, true>; + +using FunctionWordShingleSimhashCaseInsensitive + = FunctionsStringHash, NameWordShingleSimhashCaseInsensitive, true>; + +using FunctionWordShingleSimhashUTF8 = FunctionsStringHash, NameWordShingleSimhashUTF8, true>; + +using FunctionWordShingleSimhashCaseInsensitiveUTF8 + = FunctionsStringHash, NameWordShingleSimhashCaseInsensitiveUTF8, true>; + +// Minhash +using FunctionNgramMinhash = FunctionsStringHash, NameNgramMinhash, false>; + +using FunctionNgramMinhashCaseInsensitive + = FunctionsStringHash, NameNgramMinhashCaseInsensitive, false>; + +using FunctionNgramMinhashUTF8 = FunctionsStringHash, NameNgramMinhashUTF8, false>; + +using FunctionNgramMinhashCaseInsensitiveUTF8 + = FunctionsStringHash, NameNgramMinhashCaseInsensitiveUTF8, false>; + +using FunctionWordShingleMinhash = FunctionsStringHash, NameWordShingleMinhash, false>; + +using FunctionWordShingleMinhashCaseInsensitive + = FunctionsStringHash, NameWordShingleMinhashCaseInsensitive, false>; + +using FunctionWordShingleMinhashUTF8 + = FunctionsStringHash, NameWordShingleMinhashUTF8, false>; + +using FunctionWordShingleMinhashCaseInsensitiveUTF8 + = FunctionsStringHash, NameWordShingleMinhashCaseInsensitiveUTF8, false>; + +void registerFunctionsStringHash(FunctionFactory & factory) +{ + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + 
factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); +} +} + diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h new file mode 100644 index 00000000000..979f2bd8a9d --- /dev/null +++ b/src/Functions/FunctionsStringHash.h @@ -0,0 +1,83 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +// FunctionStringHash +// Simhash: String -> UInt64 +// Minhash: String -> (UInt64, UInt64) +template +class FunctionsStringHash : public IFunction +{ +public: + static constexpr auto name = Name::name; + + static FunctionPtr create(const Context &) { return std::make_shared(); } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isString(arguments[0])) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Function {} expect single String argument, got {}", getName(), arguments[0]->getName()); + + auto type = std::make_shared(); + if constexpr (is_simhash) + return type; + + return std::make_shared(DataTypes{type, type}); + } + + bool useDefaultImplementationForConstants() const override { return true; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override + { + const ColumnPtr & column = arguments[0].column; + + if constexpr (is_simhash) + { + // non const string, const case is handled by useDefaultImplementationForConstants. 
+ auto col_res = ColumnVector::create(); + auto & vec_res = col_res->getData(); + vec_res.resize(column->size()); + const ColumnString * col_str_vector = checkAndGetColumn(&*column); + Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_res); + return col_res; + } + else // Min hash + { + // non const string + auto col_h1 = ColumnVector::create(); + auto col_h2 = ColumnVector::create(); + auto & vec_h1 = col_h1->getData(); + auto & vec_h2 = col_h2->getData(); + vec_h1.resize(column->size()); + vec_h2.resize(column->size()); + const ColumnString * col_str_vector = checkAndGetColumn(&*column); + Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_h1, vec_h2); + MutableColumns tuple_columns; + tuple_columns.emplace_back(std::move(col_h1)); + tuple_columns.emplace_back(std::move(col_h2)); + return ColumnTuple::create(std::move(tuple_columns)); + } + } +}; +} + diff --git a/src/Functions/bitHammingDistance.cpp b/src/Functions/bitHammingDistance.cpp new file mode 100644 index 00000000000..9b9ff5b6c07 --- /dev/null +++ b/src/Functions/bitHammingDistance.cpp @@ -0,0 +1,160 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + + +template +struct BitHammingDistanceImpl +{ + using ResultType = UInt8; + + static void NO_INLINE vectorVector(const PaddedPODArray & a, const PaddedPODArray & b, PaddedPODArray & c) + { + size_t size = a.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a[i], b[i]); + } + + static void NO_INLINE vectorConstant(const PaddedPODArray & a, B b, PaddedPODArray & c) + { + size_t size = a.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a[i], b); + } + + static void NO_INLINE constantVector(A a, const PaddedPODArray & b, PaddedPODArray & c) + { + size_t size = b.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a, b[i]); + } + +private: + static inline UInt8 apply(UInt64 a, UInt64 b) + { + UInt64 res = a ^ b; + return __builtin_popcountll(res); + } +}; + +template +bool castType(const IDataType * type, F && f) +{ + return castTypeToEither< + DataTypeInt8, + DataTypeInt16, + DataTypeInt32, + DataTypeInt64, + DataTypeUInt8, + DataTypeUInt16, + DataTypeUInt32, + DataTypeUInt64>(type, std::forward(f)); +} + +template +static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) +{ + return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); +} + +// bitHammingDistance function: (Integer, Integer) -> UInt8 +class FunctionBitHammingDistance : public IFunction +{ +public: + static constexpr auto name = "bitHammingDistance"; + using ResultType = UInt8; + static FunctionPtr create(const Context &) { return std::make_shared(); } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isInteger(arguments[0])) + throw Exception( + "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + if (!isInteger(arguments[1])) + throw Exception( + "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + return std::make_shared(); + } + + bool useDefaultImplementationForConstants() const override { return true; } + + 
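The heart of bitHammingDistance is BitHammingDistanceImpl::apply() above: XOR the two integers and count the set bits. A self-contained illustration of that formula follows (a sketch independent of the column machinery; __builtin_popcountll is the same intrinsic the patch itself uses):

#include <cassert>
#include <cstdint>

/// Hamming distance of two integers = number of bit positions in which they differ,
/// i.e. the popcount of their XOR -- the same formula as BitHammingDistanceImpl::apply.
static uint8_t bit_hamming_distance(uint64_t a, uint64_t b)
{
    return static_cast<uint8_t>(__builtin_popcountll(a ^ b));
}

int main()
{
    assert(bit_hamming_distance(0b1011, 0b0010) == 2);  /// bits 0 and 3 differ
    assert(bit_hamming_distance(42, 42) == 0);          /// equal values -> distance 0
    return 0;
}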
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto * left_generic = arguments[0].type.get(); + const auto * right_generic = arguments[1].type.get(); + ColumnPtr result_column; + bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) + { + using LeftDataType = std::decay_t; + using RightDataType = std::decay_t; + using T0 = typename LeftDataType::FieldType; + using T1 = typename RightDataType::FieldType; + using ColVecT0 = ColumnVector; + using ColVecT1 = ColumnVector; + using ColVecResult = ColumnVector; + + using OpImpl = BitHammingDistanceImpl; + + const auto * const col_left_raw = arguments[0].column.get(); + const auto * const col_right_raw = arguments[1].column.get(); + + typename ColVecResult::MutablePtr col_res = nullptr; + col_res = ColVecResult::create(); + + auto & vec_res = col_res->getData(); + vec_res.resize(input_rows_count); + + if (auto col_left_const = checkAndGetColumnConst(col_left_raw)) + { + if (auto col_right = checkAndGetColumn(col_right_raw)) + { + // constant integer - non-constant integer + OpImpl::constantVector(col_left_const->template getValue(), col_right->getData(), vec_res); + } + else + return false; + } + else if (auto col_left = checkAndGetColumn(col_left_raw)) + { + if (auto col_right = checkAndGetColumn(col_right_raw)) + // non-constant integer - non-constant integer + OpImpl::vectorVector(col_left->getData(), col_right->getData(), vec_res); + else if (auto col_right_const = checkAndGetColumnConst(col_right_raw)) + // non-constant integer - constant integer + OpImpl::vectorConstant(col_left->getData(), col_right_const->template getValue(), vec_res); + else + return false; + } + else + return false; + + result_column = std::move(col_res); + return true; + }); + if (!valid) + throw Exception(getName() + "'s arguments do not match the expected data types", ErrorCodes::ILLEGAL_COLUMN); + + return result_column; + } +}; + +void registerFunctionBitHammingDistance(FunctionFactory & factory) +{ + factory.registerFunction(); +} +} diff --git a/src/Functions/registerFunctions.cpp b/src/Functions/registerFunctions.cpp index 3f75746f861..d827cc40a86 100644 --- a/src/Functions/registerFunctions.cpp +++ b/src/Functions/registerFunctions.cpp @@ -42,7 +42,9 @@ void registerFunctionsNull(FunctionFactory &); void registerFunctionsJSON(FunctionFactory &); void registerFunctionsConsistentHashing(FunctionFactory & factory); void registerFunctionsUnixTimestamp64(FunctionFactory & factory); - +void registerFunctionBitHammingDistance(FunctionFactory & factory); +void registerFunctionTupleHammingDistance(FunctionFactory & factory); +void registerFunctionsStringHash(FunctionFactory & factory); #if !defined(ARCADIA_BUILD) void registerFunctionBayesAB(FunctionFactory &); #endif @@ -57,7 +59,6 @@ void registerFunctionAESDecryptMysql(FunctionFactory & factory); #endif - void registerFunctions() { auto & factory = FunctionFactory::instance(); @@ -99,6 +100,9 @@ void registerFunctions() registerFunctionsIntrospection(factory); registerFunctionsConsistentHashing(factory); registerFunctionsUnixTimestamp64(factory); + registerFunctionBitHammingDistance(factory); + registerFunctionTupleHammingDistance(factory); + registerFunctionsStringHash(factory); #if !defined(ARCADIA_BUILD) registerFunctionBayesAB(factory); diff --git a/src/Functions/tupleHammingDistance.cpp b/src/Functions/tupleHammingDistance.cpp new file mode 100644 index 00000000000..67d5f73065b 
--- /dev/null +++ b/src/Functions/tupleHammingDistance.cpp @@ -0,0 +1,220 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +template +struct TupleHammingDistanceImpl +{ + using ResultType = UInt8; + + static void NO_INLINE vectorVector( + const PaddedPODArray & a1, + const PaddedPODArray & b1, + const PaddedPODArray & a2, + const PaddedPODArray & b2, + PaddedPODArray & c) + { + size_t size = a1.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a1[i], a2[i]) + apply(b1[i], b2[i]); + } + + static void NO_INLINE + vectorConstant(const PaddedPODArray & a1, const PaddedPODArray & b1, UInt64 a2, UInt64 b2, PaddedPODArray & c) + { + size_t size = a1.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a1[i], a2) + apply(b1[i], b2); + } + + static void NO_INLINE + constantVector(UInt64 a1, UInt64 b1, const PaddedPODArray & a2, const PaddedPODArray & b2, PaddedPODArray & c) + { + size_t size = a2.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a1, a2[i]) + apply(b1, b2[i]); + } + + static ResultType constantConstant(UInt64 a1, UInt64 b1, UInt64 a2, UInt64 b2) { return apply(a1, a2) + apply(b1, b2); } + +private: + static inline UInt8 apply(UInt64 a, UInt64 b) { return a != b; } +}; + +template +bool castType(const IDataType * type, F && f) +{ + return castTypeToEither< + DataTypeInt8, + DataTypeInt16, + DataTypeInt32, + DataTypeInt64, + DataTypeUInt8, + DataTypeUInt16, + DataTypeUInt32, + DataTypeUInt64>(type, std::forward(f)); +} + +template +static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) +{ + return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); +} + +// tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer))->0/1/2 +// in order to avoid code bloating, for non-constant tuple, we make sure that the elements +// in the tuple should have same data type, and for constant tuple, elements can be any integer +// data type, we cast all of them into UInt64 +class FunctionTupleHammingDistance : public IFunction +{ +public: + static constexpr auto name = "tupleHammingDistance"; + using ResultType = UInt8; + static FunctionPtr create(const Context &) { return std::make_shared(); } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isTuple(arguments[0])) + throw Exception( + "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + if (!isTuple(arguments[1])) + throw Exception( + "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const ColumnWithTypeAndName & arg1 = arguments[0]; + const ColumnWithTypeAndName & arg2 = arguments[1]; + const DataTypeTuple & type1 = static_cast(*arg1.type); + const DataTypeTuple & type2 = static_cast(*arg2.type); + const auto & left_elems = type1.getElements(); + const auto & right_elems = type2.getElements(); + if (left_elems.size() != 2 || right_elems.size() != 2) + throw Exception( + "Illegal column 
of arguments of function " + getName() + ", tuple should have exactly two elements.", + ErrorCodes::ILLEGAL_COLUMN); + + ColumnPtr result_column; + + bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) + { + using LeftDataType = std::decay_t; + using RightDataType = std::decay_t; + using T0 = typename LeftDataType::FieldType; + using T1 = typename RightDataType::FieldType; + using ColVecT0 = ColumnVector; + using ColVecT1 = ColumnVector; + using ColVecResult = ColumnVector; + + using OpImpl = TupleHammingDistanceImpl; + + // we cannot useDefaultImplementationForConstants, + // because with that, tupleHammingDistance((10, 300), (10, 20)) does not work, + // since 10 has data type UInt8, and 300 has data type UInt16 + if (const ColumnConst * const_col_left = checkAndGetColumnConst(arg1.column.get())) + { + if (const ColumnConst * const_col_right = checkAndGetColumnConst(arg2.column.get())) + { + auto cols1 = convertConstTupleToConstantElements(*const_col_left); + auto cols2 = convertConstTupleToConstantElements(*const_col_right); + Field a1, b1, a2, b2; + cols1[0]->get(0, a1); + cols1[1]->get(0, b1); + cols2[0]->get(0, a2); + cols2[1]->get(0, b2); + auto res = OpImpl::constantConstant(a1.get(), b1.get(), a2.get(), b2.get()); + result_column = DataTypeUInt8().createColumnConst(const_col_left->size(), toField(res)); + return true; + } + } + + typename ColVecResult::MutablePtr col_res = nullptr; + col_res = ColVecResult::create(); + auto & vec_res = col_res->getData(); + vec_res.resize(input_rows_count); + // constant tuple - non-constant tuple + if (const ColumnConst * const_col_left = checkAndGetColumnConst(arg1.column.get())) + { + if (const ColumnTuple * col_right = typeid_cast(arg2.column.get())) + { + auto const_cols = convertConstTupleToConstantElements(*const_col_left); + Field a1, b1; + const_cols[0]->get(0, a1); + const_cols[1]->get(0, b1); + auto col_r1 = checkAndGetColumn(&col_right->getColumn(0)); + auto col_r2 = checkAndGetColumn(&col_right->getColumn(1)); + if (col_r1 && col_r2) + OpImpl::constantVector(a1.get(), b1.get(), col_r1->getData(), col_r2->getData(), vec_res); + else + return false; + } + else + return false; + } + else if (const ColumnTuple * col_left = typeid_cast(arg1.column.get())) + { + auto col_l1 = checkAndGetColumn(&col_left->getColumn(0)); + auto col_l2 = checkAndGetColumn(&col_left->getColumn(1)); + if (col_l1 && col_l2) + { + // non-constant tuple - constant tuple + if (const ColumnConst * const_col_right = checkAndGetColumnConst(arg2.column.get())) + { + auto const_cols = convertConstTupleToConstantElements(*const_col_right); + Field a2, b2; + const_cols[0]->get(0, a2); + const_cols[1]->get(0, b2); + OpImpl::vectorConstant(col_l1->getData(), col_l2->getData(), a2.get(), b2.get(), vec_res); + } + // non-constant tuple - non-constant tuple + else if (const ColumnTuple * col_right = typeid_cast(arg2.column.get())) + { + auto col_r1 = checkAndGetColumn(&col_right->getColumn(0)); + auto col_r2 = checkAndGetColumn(&col_right->getColumn(1)); + if (col_r1 && col_r2) + OpImpl::vectorVector(col_l1->getData(), col_l2->getData(), col_r1->getData(), col_r2->getData(), vec_res); + else + return false; + } + else + return false; + } + else + return false; + } + else + return false; + result_column = std::move(col_res); + return true; + }); + if (!valid) + throw Exception(getName() + "'s arguments do not match the expected data types", ErrorCodes::ILLEGAL_COLUMN); + + return result_column; + } +}; + +void
registerFunctionTupleHammingDistance(FunctionFactory & factory) +{ + factory.registerFunction(); +} +} diff --git a/src/Functions/ya.make b/src/Functions/ya.make index 6e5d832db77..7e64deef64d 100644 --- a/src/Functions/ya.make +++ b/src/Functions/ya.make @@ -53,6 +53,7 @@ SRCS( FunctionsRandom.cpp FunctionsRound.cpp FunctionsStringArray.cpp + FunctionsStringHash.cpp FunctionsStringSimilarity.cpp GatherUtils/concat.cpp GatherUtils/createArraySink.cpp @@ -185,6 +186,7 @@ SRCS( bitBoolMaskAnd.cpp bitBoolMaskOr.cpp bitCount.cpp + bitHammingDistance.cpp bitNot.cpp bitOr.cpp bitRotateLeft.cpp @@ -504,6 +506,7 @@ SRCS( tryBase64Decode.cpp tuple.cpp tupleElement.cpp + tupleHammingDistance.cpp upper.cpp upperUTF8.cpp uptime.cpp diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp index 5c66c3209f6..2c13446e693 100644 --- a/src/IO/ReadBufferFromPocoSocket.cpp +++ b/src/IO/ReadBufferFromPocoSocket.cpp @@ -28,10 +28,23 @@ bool ReadBufferFromPocoSocket::nextImpl() ssize_t bytes_read = 0; Stopwatch watch; + int flags = 0; + if (async_callback) + flags |= MSG_DONTWAIT; + /// Add more details to exceptions. try { - bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size()); + bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags); + + /// If async_callback is specified, and read is blocking, run async_callback and try again later. + /// It is expected that file descriptor may be polled externally. + /// Note that receive timeout is not checked here. External code should check it while polling. + while (bytes_read < 0 && async_callback && errno == EAGAIN) + { + async_callback(socket); + bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags); + } } catch (const Poco::Net::NetException & e) { diff --git a/src/IO/ReadBufferFromPocoSocket.h b/src/IO/ReadBufferFromPocoSocket.h index f328b89d99c..8064cd39246 100644 --- a/src/IO/ReadBufferFromPocoSocket.h +++ b/src/IO/ReadBufferFromPocoSocket.h @@ -5,7 +5,6 @@ #include #include - namespace DB { @@ -28,6 +27,11 @@ public: ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE); bool poll(size_t timeout_microseconds); + + void setAsyncCallback(std::function async_callback_) { async_callback = std::move(async_callback_); } + +private: + std::function async_callback; }; } diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 62cb6047704..2fc78261f17 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -624,7 +624,7 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions( { auto & input = inputs[res_elem.name]; if (input.empty()) - throw Exception("Cannot find column " + backQuoteIfNeed(res_elem.name) + " in source stream", + throw Exception("Cannot find column " + backQuote(res_elem.name) + " in source stream", ErrorCodes::THERE_IS_NO_COLUMN); src_node = actions_dag->inputs[input.front()]; @@ -641,12 +641,12 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions( if (ignore_constant_values) src_node = const_cast(&actions_dag->addColumn(res_elem, true)); else if (res_const->getField() != src_const->getField()) - throw Exception("Cannot convert column " + backQuoteIfNeed(res_elem.name) + " because " + throw Exception("Cannot convert column " + backQuote(res_elem.name) + " because " "it is constant but values of constants are different in source and result", ErrorCodes::ILLEGAL_COLUMN); } else - throw Exception("Cannot convert 
column " + backQuoteIfNeed(res_elem.name) + " because " + throw Exception("Cannot convert column " + backQuote(res_elem.name) + " because " "it is non constant in source stream but must be constant in result", ErrorCodes::ILLEGAL_COLUMN); } diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index 8aaf740b34b..c7bbc019518 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -735,6 +735,28 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & } } + if (node.is_window_function) + { + // Also add columns from PARTITION BY and ORDER BY of window functions. + // Requiring a constant reference to a shared pointer to non-const AST + // doesn't really look sane, but the visitor does indeed require it. + if (node.window_partition_by) + { + visit(node.window_partition_by->clone(), data); + } + if (node.window_order_by) + { + visit(node.window_order_by->clone(), data); + } + + // Don't need to do anything more for window functions here -- the + // resulting column is added in ExpressionAnalyzer, similar to the + // aggregate functions. + return; + } + + // An aggregate function can also be calculated as a window function, but we + // checked for it above, so no need to do anything more. if (AggregateFunctionFactory::instance().isAggregateFunctionName(node.name)) return; diff --git a/src/Interpreters/AggregateDescription.cpp b/src/Interpreters/AggregateDescription.cpp index e483eb1b7a1..2748a2abe9d 100644 --- a/src/Interpreters/AggregateDescription.cpp +++ b/src/Interpreters/AggregateDescription.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace DB { @@ -99,4 +100,31 @@ void AggregateDescription::explain(WriteBuffer & out, size_t indent) const } } +std::string WindowFunctionDescription::dump() const +{ + WriteBufferFromOwnString ss; + + ss << "window function '" << column_name << "\n"; + ss << "function node " << function_node->dumpTree() << "\n"; + ss << "aggregate function '" << aggregate_function->getName() << "'\n"; + if (!function_parameters.empty()) + { + ss << "parameters " << toString(function_parameters) << "\n"; + } + + return ss.str(); +} + +std::string WindowDescription::dump() const +{ + WriteBufferFromOwnString ss; + + ss << "window '" << window_name << "'\n"; + ss << "partition_by " << dumpSortDescription(partition_by) << "\n"; + ss << "order_by " << dumpSortDescription(order_by) << "\n"; + ss << "full_sort_description " << dumpSortDescription(full_sort_description) << "\n"; + + return ss.str(); +} + } diff --git a/src/Interpreters/AggregateDescription.h b/src/Interpreters/AggregateDescription.h index 396a62c446a..f1fc232d04d 100644 --- a/src/Interpreters/AggregateDescription.h +++ b/src/Interpreters/AggregateDescription.h @@ -1,13 +1,18 @@ #pragma once +#include +#include #include #include -#include +#include +#include namespace DB { +class ASTFunction; + struct AggregateDescription { AggregateFunctionPtr function; @@ -21,4 +26,44 @@ struct AggregateDescription using AggregateDescriptions = std::vector; + +struct WindowFunctionDescription +{ + std::string column_name; + const ASTFunction * function_node; + AggregateFunctionPtr aggregate_function; + Array function_parameters; + DataTypes argument_types; + Names argument_names; + + std::string dump() const; +}; + +struct WindowDescription +{ + std::string window_name; + + // We don't care about the particular order of keys for PARTITION BY, only + // that they are sorted. 
For now we always require ASC, but we could be more + // flexible and match any direction, or even different order of columns. + SortDescription partition_by; + + SortDescription order_by; + + // To calculate the window function, we sort input data first by PARTITION BY, + // then by ORDER BY. This field holds this combined sort order. + SortDescription full_sort_description; + + // No frame info as of yet. + + // The window functions that are calculated for this window. + std::vector window_functions; + + std::string dump() const; +}; + +using WindowFunctionDescriptions = std::vector; + +using WindowDescriptions = std::unordered_map; + } diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 14cc27f75ea..4fe922252d5 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -212,18 +212,18 @@ void AsynchronousMetrics::update() { Int64 amount = total_memory_tracker.get(); Int64 peak = total_memory_tracker.getPeak(); - Int64 new_peak = data.resident; + Int64 new_amount = data.resident; LOG_DEBUG(&Poco::Logger::get("AsynchronousMetrics"), "MemoryTracking: was {}, peak {}, will set to {} (RSS), difference: {}", ReadableSize(amount), ReadableSize(peak), - ReadableSize(new_peak), - ReadableSize(new_peak - peak) + ReadableSize(new_amount), + ReadableSize(new_amount - amount) ); - total_memory_tracker.set(new_peak); - CurrentMetrics::set(CurrentMetrics::MemoryTracking, new_peak); + total_memory_tracker.set(new_amount); + CurrentMetrics::set(CurrentMetrics::MemoryTracking, new_amount); } } #endif diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp index e2a7c5b55dc..47726e49d50 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp @@ -126,6 +126,7 @@ void SelectStreamFactory::createForShard( bool add_agg_info = processed_stage == QueryProcessingStage::WithMergeableState; bool add_totals = false; bool add_extremes = false; + bool async_read = context_ptr->getSettingsRef().async_socket_for_remote; if (processed_stage == QueryProcessingStage::Complete) { add_totals = query_ast->as().group_by_with_totals; @@ -153,7 +154,7 @@ void SelectStreamFactory::createForShard( if (!table_func_ptr) remote_query_executor->setMainTable(main_table); - remote_pipes.emplace_back(createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes)); + remote_pipes.emplace_back(createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read)); remote_pipes.back().addInterpreterContext(context_ptr); }; @@ -249,7 +250,7 @@ void SelectStreamFactory::createForShard( pool = shard_info.pool, shard_num = shard_info.shard_num, modified_query, header = header, modified_query_ast, &context, context_ptr, throttler, main_table = main_table, table_func_ptr = table_func_ptr, scalars = scalars, external_tables = external_tables, - stage = processed_stage, local_delay, add_agg_info, add_totals, add_extremes]() + stage = processed_stage, local_delay, add_agg_info, add_totals, add_extremes, async_read]() -> Pipe { auto current_settings = context.getSettingsRef(); @@ -295,7 +296,7 @@ void SelectStreamFactory::createForShard( auto remote_query_executor = std::make_shared( std::move(connections), modified_query, header, context, throttler, scalars, external_tables, stage); - return createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, 
add_extremes); + return createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read); } }; diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 1b93c090842..31c12490408 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -50,6 +50,9 @@ #include #include +#include +#include + namespace DB { @@ -58,12 +61,14 @@ using LogAST = DebugASTLog; /// set to true to enable logs namespace ErrorCodes { - extern const int UNKNOWN_TYPE_OF_AST_NODE; - extern const int UNKNOWN_IDENTIFIER; + extern const int BAD_ARGUMENTS; extern const int ILLEGAL_PREWHERE; - extern const int LOGICAL_ERROR; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER; + extern const int LOGICAL_ERROR; + extern const int NOT_IMPLEMENTED; + extern const int UNKNOWN_IDENTIFIER; + extern const int UNKNOWN_TYPE_OF_AST_NODE; } namespace @@ -283,6 +288,8 @@ void ExpressionAnalyzer::analyzeAggregation() { aggregated_columns = temp_actions->getNamesAndTypesList(); } + + has_window = makeWindowDescriptions(temp_actions); } @@ -444,7 +451,11 @@ bool ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions) auto it = index.find(name); if (it == index.end()) - throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, "Unknown identifier (in aggregate function '{}'): {}", node->name, name); + { + throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, + "Unknown identifier '{}' in aggregate function '{}'", + name, node->formatForErrorMessage()); + } types[i] = (*it)->result_type; aggregate.argument_names[i] = name; @@ -461,6 +472,128 @@ bool ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions) } +bool ExpressionAnalyzer::makeWindowDescriptions(ActionsDAGPtr & actions) +{ + // Convenient to check here because at least we have the Context. + if (!syntax->window_function_asts.empty() && + !context.getSettingsRef().allow_experimental_window_functions) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Window functions are not implemented (while processing '{}')", + syntax->window_function_asts[0]->formatForErrorMessage()); + } + + for (const ASTFunction * function_node : syntax->window_function_asts) + { + assert(function_node->is_window_function); + + WindowDescription window_description; + window_description.window_name = function_node->getWindowDescription(); + + if (function_node->window_partition_by) + { + for (const auto & column_ast + : function_node->window_partition_by->children) + { + const auto * with_alias = dynamic_cast( + column_ast.get()); + if (!with_alias) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Expected a column in PARTITION BY for window '{}'," + " got '{}'", window_description.window_name, + column_ast->formatForErrorMessage()); + } + window_description.partition_by.push_back( + SortColumnDescription( + with_alias->getColumnName(), 1 /* direction */, + 1 /* nulls_direction */)); + } + } + + if (function_node->window_order_by) + { + for (const auto & column_ast + : function_node->window_order_by->children) + { + // Parser should have checked that we have a proper element here. + const auto & order_by_element + = column_ast->as(); + // Ignore collation for now. 
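An illustrative aside (not part of the patch; the query and column names are hypothetical) to make the three descriptions concrete. For a call like count() OVER (PARTITION BY a ORDER BY b DESC), the loop above fills partition_by with the PARTITION BY keys at a fixed ascending direction, this loop fills order_by from the parsed ORDER BY elements, and the concatenation just below yields the combined sort order later consumed by executeWindow().

#include <Core/SortDescription.h>

// Hypothetical contents produced by the analysis above
// (direction: 1 = ASC, -1 = DESC; nulls_direction mirrors the parsed element):
DB::SortDescription partition_by{{"a", 1, 1}};
DB::SortDescription order_by{{"b", -1, -1}};

// full_sort_description is the plain concatenation: partition keys first,
// then the window ORDER BY keys, so a single sort groups the partitions and
// orders the rows inside each of them.
DB::SortDescription full_sort_description = partition_by;
full_sort_description.insert(full_sort_description.end(),
                             order_by.begin(), order_by.end());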
+ window_description.order_by.push_back( + SortColumnDescription( + order_by_element.children.front()->getColumnName(), + order_by_element.direction, + order_by_element.nulls_direction)); + } + } + + window_description.full_sort_description = window_description.partition_by; + window_description.full_sort_description.insert( + window_description.full_sort_description.end(), + window_description.order_by.begin(), + window_description.order_by.end()); + + WindowFunctionDescription window_function; + window_function.function_node = function_node; + window_function.column_name + = window_function.function_node->getColumnName(); + window_function.function_parameters + = window_function.function_node->parameters + ? getAggregateFunctionParametersArray( + window_function.function_node->parameters) + : Array(); + + // Requiring a constant reference to a shared pointer to non-const AST + // doesn't really look sane, but the visitor does indeed require it. + // Hence we clone the node (not very sane either, I know). + getRootActionsNoMakeSet(window_function.function_node->clone(), + true, actions); + + const ASTs & arguments + = window_function.function_node->arguments->children; + window_function.argument_types.resize(arguments.size()); + window_function.argument_names.resize(arguments.size()); + const auto & index = actions->getIndex(); + for (size_t i = 0; i < arguments.size(); ++i) + { + const std::string & name = arguments[i]->getColumnName(); + + auto it = index.find(name); + if (it == index.end()) + { + throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, + "Unknown identifier '{}' in window function '{}'", + name, window_function.function_node->formatForErrorMessage()); + } + + window_function.argument_types[i] = (*it)->result_type; + window_function.argument_names[i] = name; + } + + AggregateFunctionProperties properties; + window_function.aggregate_function + = AggregateFunctionFactory::instance().get( + window_function.function_node->name, + window_function.argument_types, + window_function.function_parameters, properties); + + auto [it, inserted] = window_descriptions.insert( + {window_description.window_name, window_description}); + + if (!inserted) + { + assert(it->second.full_sort_description + == window_description.full_sort_description); + } + + it->second.window_functions.push_back(window_function); + } + + return !syntax->window_function_asts.empty(); +} + + const ASTSelectQuery * ExpressionAnalyzer::getSelectQuery() const { const auto * select_query = query->as(); @@ -831,6 +964,65 @@ void SelectQueryExpressionAnalyzer::appendAggregateFunctionsArguments(Expression getRootActions(argument, only_types, step.actions()); } +void SelectQueryExpressionAnalyzer::appendWindowFunctionsArguments( + ExpressionActionsChain & chain, bool /* only_types */) +{ + ExpressionActionsChain::Step & step = chain.lastStep(aggregated_columns); + + // 1) Add actions for window functions and their arguments; + // 2) Mark the columns that are really required. + for (const auto & [_, w] : window_descriptions) + { + for (const auto & f : w.window_functions) + { + // 1.1) arguments of window functions; + // Requiring a constant reference to a shared pointer to non-const AST + // doesn't really look sane, but the visitor does indeed require it. + getRootActionsNoMakeSet(f.function_node->clone(), + true /* no_subqueries */, step.actions()); + + // 1.2) result of window function: an empty INPUT. + // It is an aggregate function, so it won't be added by getRootActions. + // This is something of a hack. 
Other options: + // a] do it like aggregate function -- break the chain of actions + // and manually add window functions to the starting list of + // input columns. Logically this is similar to what we're doing + // now, but would require to split the window function processing + // into a full-fledged step after plain functions. This would be + // somewhat cumbersome. With INPUT hack we can avoid a separate + // step and pretend that window functions are almost "normal" + // select functions. The limitation of both these ways is that + // we can't reference window functions in other SELECT + // expressions. + // b] add a WINDOW action type, then sort, then split the chain on + // each WINDOW action and insert the Window pipeline between the + // Expression pipelines. This is a "proper" way that would allow + // us to depend on window functions in other functions. But it's + // complicated so I avoid doing it for now. + ColumnWithTypeAndName col; + col.type = f.aggregate_function->getReturnType(); + col.column = col.type->createColumn(); + col.name = f.column_name; + + step.actions()->addInput(col); + + for (const auto & a : f.function_node->arguments->children) + { + // 2.1) function arguments; + step.required_output.push_back(a->getColumnName()); + } + // 2.2) function result; + step.required_output.push_back(f.column_name); + } + + // 2.3) PARTITION BY and ORDER BY columns. + for (const auto & c : w.full_sort_description) + { + step.required_output.push_back(c.column_name); + } + } +} + bool SelectQueryExpressionAnalyzer::appendHaving(ExpressionActionsChain & chain, bool only_types) { const auto * select_query = getAggregatingQuery(); @@ -855,7 +1047,9 @@ void SelectQueryExpressionAnalyzer::appendSelect(ExpressionActionsChain & chain, getRootActions(select_query->select(), only_types, step.actions()); for (const auto & child : select_query->select()->children) + { step.required_output.push_back(child->getColumnName()); + } } ActionsDAGPtr SelectQueryExpressionAnalyzer::appendOrderBy(ExpressionActionsChain & chain, bool only_types, bool optimize_read_in_order, @@ -1076,6 +1270,7 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( : first_stage(first_stage_) , second_stage(second_stage_) , need_aggregate(query_analyzer.hasAggregation()) + , has_window(query_analyzer.hasWindow()) { /// first_stage: Do I need to perform the first part of the pipeline - running on remote servers during distributed processing. /// second_stage: Do I need to execute the second part of the pipeline - running on the initiating server during distributed processing. @@ -1225,6 +1420,9 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( /// If there is aggregation, we execute expressions in SELECT and ORDER BY on the initiating server, otherwise on the source servers. query_analyzer.appendSelect(chain, only_types || (need_aggregate ? 
!second_stage : !first_stage)); + + query_analyzer.appendWindowFunctionsArguments(chain, only_types || !first_stage); + selected_columns = chain.getLastStep().required_output; has_order_by = query.orderBy() != nullptr; before_order_and_select = query_analyzer.appendOrderBy( @@ -1321,4 +1519,75 @@ void ExpressionAnalysisResult::checkActions() const } } +std::string ExpressionAnalysisResult::dump() const +{ + WriteBufferFromOwnString ss; + + ss << "need_aggregate " << need_aggregate << "\n"; + ss << "has_order_by " << has_order_by << "\n"; + ss << "has_window " << has_window << "\n"; + + if (before_array_join) + { + ss << "before_array_join " << before_array_join->dumpDAG() << "\n"; + } + + if (array_join) + { + ss << "array_join " << "FIXME doesn't have dump" << "\n"; + } + + if (before_join) + { + ss << "before_join " << before_join->dumpDAG() << "\n"; + } + + if (before_where) + { + ss << "before_where " << before_where->dumpDAG() << "\n"; + } + + if (prewhere_info) + { + ss << "prewhere_info " << prewhere_info->dump() << "\n"; + } + + if (filter_info) + { + ss << "filter_info " << filter_info->dump() << "\n"; + } + + if (before_aggregation) + { + ss << "before_aggregation " << before_aggregation->dumpDAG() << "\n"; + } + + if (before_having) + { + ss << "before_having " << before_having->dumpDAG() << "\n"; + } + + if (before_window) + { + ss << "before_window " << before_window->dumpDAG() << "\n"; + } + + if (before_order_and_select) + { + ss << "before_order_and_select " << before_order_and_select->dumpDAG() << "\n"; + } + + if (before_limit_by) + { + ss << "before_limit_by " << before_limit_by->dumpDAG() << "\n"; + } + + if (final_projection) + { + ss << "final_projection " << final_projection->dumpDAG() << "\n"; + } + + return ss.str(); +} + } diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index 2567b32e37e..fb0cb4ea4c3 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -60,6 +60,10 @@ struct ExpressionAnalyzerData NamesAndTypesList aggregation_keys; AggregateDescriptions aggregate_descriptions; + bool has_window = false; + WindowDescriptions window_descriptions; + NamesAndTypesList window_columns; + bool has_global_subqueries = false; /// All new temporary tables obtained by performing the GLOBAL IN/JOIN subqueries. @@ -116,6 +120,9 @@ public: /// Get intermediates for tests const ExpressionAnalyzerData & getAnalyzedData() const { return *this; } + /// A list of windows for window functions. + const WindowDescriptions & windowDescriptions() const { return window_descriptions; } + protected: ExpressionAnalyzer( const ASTPtr & query_, @@ -159,6 +166,8 @@ protected: void analyzeAggregation(); bool makeAggregateDescriptions(ActionsDAGPtr & actions); + bool makeWindowDescriptions(ActionsDAGPtr & actions); + const ASTSelectQuery * getSelectQuery() const; bool isRemoteStorage() const { return syntax->is_remote_storage; } @@ -169,6 +178,8 @@ class SelectQueryExpressionAnalyzer; /// Result of SelectQueryExpressionAnalyzer: expressions for InterpreterSelectQuery struct ExpressionAnalysisResult { + std::string dump() const; + /// Do I need to perform the first part of the pipeline - running on remote servers during distributed processing. bool first_stage = false; /// Do I need to execute the second part of the pipeline - running on the initiating server during distributed processing. 
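A hedged debugging sketch for the new analyzer state (the helper and logger name are illustrative, not part of the patch): windowDescriptions() is keyed by the formatted window text, so two calls sharing one OVER definition, for example sum(x) OVER (PARTITION BY a) and count() OVER (PARTITION BY a), land in a single WindowDescription with two entries in window_functions, and dump() prints the partition, order and full sort descriptions built above.

#include <common/logger_useful.h>
#include <Interpreters/ExpressionAnalyzer.h>

// Illustrative only: log every analyzed window and its functions.
void dumpWindows(const DB::ExpressionAnalyzer & analyzer)
{
    for (const auto & [name, description] : analyzer.windowDescriptions())
        LOG_DEBUG(&Poco::Logger::get("WindowDebug"),
            "window '{}' with {} function(s):\n{}",
            name, description.window_functions.size(), description.dump());
}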
@@ -176,6 +187,7 @@ struct ExpressionAnalysisResult bool need_aggregate = false; bool has_order_by = false; + bool has_window = false; bool remove_where_filter = false; bool optimize_read_in_order = false; @@ -189,6 +201,7 @@ struct ExpressionAnalysisResult ActionsDAGPtr before_where; ActionsDAGPtr before_aggregation; ActionsDAGPtr before_having; + ActionsDAGPtr before_window; ActionsDAGPtr before_order_and_select; ActionsDAGPtr before_limit_by; ActionsDAGPtr final_projection; @@ -256,6 +269,7 @@ public: /// Does the expression have aggregate functions or a GROUP BY or HAVING section. bool hasAggregation() const { return has_aggregation; } + bool hasWindow() const { return has_window; } bool hasGlobalSubqueries() { return has_global_subqueries; } bool hasTableJoin() const { return syntax->ast_join; } @@ -326,6 +340,7 @@ private: bool appendWhere(ExpressionActionsChain & chain, bool only_types); bool appendGroupBy(ExpressionActionsChain & chain, bool only_types, bool optimize_aggregation_in_order, ManyExpressionActions &); void appendAggregateFunctionsArguments(ExpressionActionsChain & chain, bool only_types); + void appendWindowFunctionsArguments(ExpressionActionsChain & chain, bool only_types); /// After aggregation: bool appendHaving(ExpressionActionsChain & chain, bool only_types); diff --git a/src/Interpreters/ExtractExpressionInfoVisitor.cpp b/src/Interpreters/ExtractExpressionInfoVisitor.cpp index 75e94de0db5..2d9339447b1 100644 --- a/src/Interpreters/ExtractExpressionInfoVisitor.cpp +++ b/src/Interpreters/ExtractExpressionInfoVisitor.cpp @@ -19,9 +19,18 @@ void ExpressionInfoMatcher::visit(const ASTPtr & ast, Data & data) void ExpressionInfoMatcher::visit(const ASTFunction & ast_function, const ASTPtr &, Data & data) { if (ast_function.name == "arrayJoin") + { data.is_array_join = true; - else if (AggregateFunctionFactory::instance().isAggregateFunctionName(ast_function.name)) + } + // "is_aggregate_function" doesn't mean much by itself. Apparently here it is + // used to move filters from HAVING to WHERE, and probably for this purpose + // an aggregate function calculated as a window function is not relevant. + else if (!ast_function.is_window_function + && AggregateFunctionFactory::instance().isAggregateFunctionName( + ast_function.name)) + { data.is_aggregate_function = true; + } else { const auto & function = FunctionFactory::instance().tryGet(ast_function.name, data.context); diff --git a/src/Interpreters/GetAggregatesVisitor.h b/src/Interpreters/GetAggregatesVisitor.h index ba1501fc624..d416a5f240e 100644 --- a/src/Interpreters/GetAggregatesVisitor.h +++ b/src/Interpreters/GetAggregatesVisitor.h @@ -19,8 +19,12 @@ public: struct Data { const char * assert_no_aggregates = nullptr; - std::unordered_set uniq_names; - std::vector aggregates; + const char * assert_no_windows = nullptr; + // Explicit empty initializers are needed to make designated initializers + // work on GCC 10. 
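A small hedged illustration of the point made in the comment above (the description string is arbitrary): the empty {} initializers on the members that follow are what let call sites, such as the assertNo* helpers later in this header, use partial designated initialization under GCC 10.

// With the explicit {} defaults below, this compiles on GCC 10:
GetAggregatesVisitor::Data data{.assert_no_windows = "in WHERE"};
// uniq_names, aggregates and window_functions stay value-initialized (empty).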
+ std::unordered_set uniq_names {}; + std::vector aggregates {}; + std::vector window_functions {}; }; static bool needChildVisit(const ASTPtr & node, const ASTPtr & child) @@ -28,8 +32,13 @@ public: if (child->as() || child->as()) return false; if (auto * func = node->as()) - if (isAggregateFunction(func->name)) + { + if (isAggregateFunction(*func) + || func->is_window_function) + { return false; + } + } return true; } @@ -42,33 +51,56 @@ public: private: static void visit(const ASTFunction & node, const ASTPtr &, Data & data) { - if (!isAggregateFunction(node.name)) - return; + if (isAggregateFunction(node)) + { + if (data.assert_no_aggregates) + throw Exception("Aggregate function " + node.getColumnName() + " is found " + String(data.assert_no_aggregates) + " in query", + ErrorCodes::ILLEGAL_AGGREGATION); - if (data.assert_no_aggregates) - throw Exception("Aggregate function " + node.getColumnName() + " is found " + String(data.assert_no_aggregates) + " in query", - ErrorCodes::ILLEGAL_AGGREGATION); + String column_name = node.getColumnName(); + if (data.uniq_names.count(column_name)) + return; - String column_name = node.getColumnName(); - if (data.uniq_names.count(column_name)) - return; + data.uniq_names.insert(column_name); + data.aggregates.push_back(&node); + } + else if (node.is_window_function) + { + if (data.assert_no_windows) + throw Exception("Window function " + node.getColumnName() + " is found " + String(data.assert_no_windows) + " in query", + ErrorCodes::ILLEGAL_AGGREGATION); - data.uniq_names.insert(column_name); - data.aggregates.push_back(&node); + String column_name = node.getColumnName(); + if (data.uniq_names.count(column_name)) + return; + + data.uniq_names.insert(column_name); + data.window_functions.push_back(&node); + } } - static bool isAggregateFunction(const String & name) + static bool isAggregateFunction(const ASTFunction & node) { - return AggregateFunctionFactory::instance().isAggregateFunctionName(name); + // Aggregate functions can also be calculated as window functions, but + // here we are interested in aggregate functions calculated in GROUP BY. 
+ return !node.is_window_function + && AggregateFunctionFactory::instance().isAggregateFunctionName( + node.name); } }; using GetAggregatesVisitor = GetAggregatesMatcher::Visitor; +inline void assertNoWindows(const ASTPtr & ast, const char * description) +{ + GetAggregatesVisitor::Data data{.assert_no_windows = description}; + GetAggregatesVisitor(data).visit(ast); +} + inline void assertNoAggregates(const ASTPtr & ast, const char * description) { - GetAggregatesVisitor::Data data{description, {}, {}}; + GetAggregatesVisitor::Data data{.assert_no_aggregates = description}; GetAggregatesVisitor(data).visit(ast); } diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 1a487f0a861..38cc19a00d6 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -35,36 +35,37 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include +#include #include -#include -#include #include -#include +#include +#include #include -#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -958,6 +959,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu else { executeExpression(query_plan, expressions.before_order_and_select, "Before ORDER BY and SELECT"); + executeWindow(query_plan); executeDistinct(query_plan, true, expressions.selected_columns, true); } @@ -1004,6 +1006,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu executeHaving(query_plan, expressions.before_having); executeExpression(query_plan, expressions.before_order_and_select, "Before ORDER BY and SELECT"); + executeWindow(query_plan); executeDistinct(query_plan, true, expressions.selected_columns, true); } @@ -1749,6 +1752,58 @@ void InterpreterSelectQuery::executeExpression(QueryPlan & query_plan, const Act } +void InterpreterSelectQuery::executeWindow(QueryPlan & query_plan) +{ + for (const auto & [_, w] : query_analyzer->windowDescriptions()) + { + const Settings & settings = context->getSettingsRef(); + + auto partial_sorting = std::make_unique( + query_plan.getCurrentDataStream(), + w.full_sort_description, + 0 /* LIMIT */, + SizeLimits(settings.max_rows_to_sort, settings.max_bytes_to_sort, + settings.sort_overflow_mode)); + partial_sorting->setStepDescription("Sort each block for window '" + + w.window_name + "'"); + query_plan.addStep(std::move(partial_sorting)); + + auto merge_sorting_step = std::make_unique( + query_plan.getCurrentDataStream(), + w.full_sort_description, + settings.max_block_size, + 0 /* LIMIT */, + settings.max_bytes_before_remerge_sort, + settings.remerge_sort_lowered_memory_bytes_ratio, + settings.max_bytes_before_external_sort, + context->getTemporaryVolume(), + settings.min_free_disk_space_for_temporary_data); + merge_sorting_step->setStepDescription("Merge sorted blocks for window '" + + w.window_name + "'"); + query_plan.addStep(std::move(merge_sorting_step)); + + // First MergeSorted, now MergingSorted. 
+ auto merging_sorted = std::make_unique( + query_plan.getCurrentDataStream(), + w.full_sort_description, + settings.max_block_size, + 0 /* LIMIT */); + merging_sorted->setStepDescription("Merge sorted streams for window '" + + w.window_name + "'"); + query_plan.addStep(std::move(merging_sorted)); + + auto window_step = std::make_unique( + query_plan.getCurrentDataStream(), + w, + w.window_functions); + window_step->setStepDescription("Window step for window '" + + w.window_name + "'"); + + query_plan.addStep(std::move(window_step)); + } +} + + void InterpreterSelectQuery::executeOrderOptimized(QueryPlan & query_plan, InputOrderInfoPtr input_sorting_info, UInt64 limit, SortDescription & output_order_descr) { const Settings & settings = context->getSettingsRef(); @@ -1795,9 +1850,13 @@ void InterpreterSelectQuery::executeOrder(QueryPlan & query_plan, InputOrderInfo /// Merge the sorted blocks. auto merge_sorting_step = std::make_unique( query_plan.getCurrentDataStream(), - output_order_descr, settings.max_block_size, limit, - settings.max_bytes_before_remerge_sort, settings.remerge_sort_lowered_memory_bytes_ratio, - settings.max_bytes_before_external_sort, context->getTemporaryVolume(), + output_order_descr, + settings.max_block_size, + limit, + settings.max_bytes_before_remerge_sort, + settings.remerge_sort_lowered_memory_bytes_ratio, + settings.max_bytes_before_external_sort, + context->getTemporaryVolume(), settings.min_free_disk_space_for_temporary_data); merge_sorting_step->setStepDescription("Merge sorted blocks for ORDER BY"); diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index d82ea4e1187..1fff316e1d4 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -120,6 +120,8 @@ private: void executeTotalsAndHaving(QueryPlan & query_plan, bool has_having, const ActionsDAGPtr & expression, bool overflow_row, bool final); void executeHaving(QueryPlan & query_plan, const ActionsDAGPtr & expression); static void executeExpression(QueryPlan & query_plan, const ActionsDAGPtr & expression, const std::string & description); + /// FIXME should go through ActionsDAG to behave as a proper function + void executeWindow(QueryPlan & query_plan); void executeOrder(QueryPlan & query_plan, InputOrderInfoPtr sorting_info); void executeOrderOptimized(QueryPlan & query_plan, InputOrderInfoPtr sorting_info, UInt64 limit, SortDescription & output_order_descr); void executeWithFill(QueryPlan & query_plan); diff --git a/src/Interpreters/MonotonicityCheckVisitor.h b/src/Interpreters/MonotonicityCheckVisitor.h index 137f8d25b4a..87571a44eb0 100644 --- a/src/Interpreters/MonotonicityCheckVisitor.h +++ b/src/Interpreters/MonotonicityCheckVisitor.h @@ -43,9 +43,14 @@ public: if (group_by_function_hashes.count(key)) return false; - /// if ORDER BY contains aggregate function it shouldn't be optimized - if (AggregateFunctionFactory::instance().isAggregateFunctionName(ast_function.name)) + /// if ORDER BY contains aggregate function or window functions, it + /// shouldn't be optimized + if (ast_function.is_window_function + || AggregateFunctionFactory::instance().isAggregateFunctionName( + ast_function.name)) + { return false; + } return true; } diff --git a/src/Interpreters/RewriteAnyFunctionVisitor.cpp b/src/Interpreters/RewriteAnyFunctionVisitor.cpp index 7c3f1bf31b2..e8f05962862 100644 --- a/src/Interpreters/RewriteAnyFunctionVisitor.cpp +++ b/src/Interpreters/RewriteAnyFunctionVisitor.cpp @@ -38,8 +38,16 @@ bool 
extractIdentifiers(const ASTFunction & func, std::unordered_set & if (arg_func->name == "lambda") return false; - if (AggregateFunctionFactory::instance().isAggregateFunctionName(arg_func->name)) + // We are looking for identifiers inside a function calculated inside + // the aggregate function `any()`. Window or aggregate function can't + // be inside `any`, but this check in GetAggregatesMatcher happens + // later, so we have to explicitly skip these nested functions here. + if (arg_func->is_window_function + || AggregateFunctionFactory::instance().isAggregateFunctionName( + arg_func->name)) + { return false; + } if (!extractIdentifiers(*arg_func, identifiers)) return false; diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index aa59696ec76..2c77212ffc4 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -439,12 +439,46 @@ std::vector getAggregates(ASTPtr & query, const ASTSelectQu /// There can not be other aggregate functions within the aggregate functions. for (const ASTFunction * node : data.aggregates) + { if (node->arguments) + { for (auto & arg : node->arguments->children) + { assertNoAggregates(arg, "inside another aggregate function"); + assertNoWindows(arg, "inside an aggregate function"); + } + } + } return data.aggregates; } +std::vector getWindowFunctions(ASTPtr & query, const ASTSelectQuery & select_query) +{ + /// There can not be window functions inside the WHERE and PREWHERE. + if (select_query.where()) + assertNoWindows(select_query.where(), "in WHERE"); + if (select_query.prewhere()) + assertNoWindows(select_query.prewhere(), "in PREWHERE"); + + GetAggregatesVisitor::Data data; + GetAggregatesVisitor(data).visit(query); + + /// There can not be other window functions within the aggregate functions. 
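A hedged illustration of what these checks enforce (query text, function name and exception wording are examples inferred from the matcher above, not verbatim output): window functions are rejected in WHERE and PREWHERE, inside aggregate arguments, and, by the loop that follows, inside other window functions.

#include <Interpreters/GetAggregatesVisitor.h>

// Illustrative only: 'where_ast' stands for the WHERE clause of a query like
//   SELECT count() FROM t WHERE sum(x) OVER () > 0
// The helper added above walks the subtree and, on finding a window function,
// throws ILLEGAL_AGGREGATION with a message along the lines of
//   "Window function sum(x) is found in WHERE in query".
void rejectWindowsInWhere(const DB::ASTPtr & where_ast)
{
    DB::assertNoWindows(where_ast, "in WHERE");
}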
+ for (const ASTFunction * node : data.window_functions) + { + if (node->arguments) + { + for (auto & arg : node->arguments->children) + { + assertNoAggregates(arg, "inside a window function"); + assertNoWindows(arg, "inside another window function"); + } + } + } + + return data.window_functions; +} + } TreeRewriterResult::TreeRewriterResult( @@ -640,14 +674,24 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select for (const auto & name : columns_context.requiredColumns()) ss << " '" << name << "'"; - if (!source_column_names.empty()) + if (storage) { - ss << ", source columns:"; - for (const auto & name : source_column_names) - ss << " '" << name << "'"; + ss << ", maybe you meant: "; + for (const auto & name : columns_context.requiredColumns()) + { + auto hints = storage->getHints(name); + if (!hints.empty()) + ss << " '" << toString(hints) << "'"; + } } else - ss << ", no source columns"; + { + if (!source_column_names.empty()) + for (const auto & name : columns_context.requiredColumns()) + ss << " '" << name << "'"; + else + ss << ", no source columns"; + } if (columns_context.has_table_join) { @@ -733,6 +777,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect( collectJoinedColumns(*result.analyzed_join, *select_query, tables_with_columns, result.aliases); result.aggregates = getAggregates(query, *select_query); + result.window_function_asts = getWindowFunctions(query, *select_query); result.collectUsedColumns(query, true); result.ast_join = select_query->join(); diff --git a/src/Interpreters/TreeRewriter.h b/src/Interpreters/TreeRewriter.h index 4075be4326d..d9f98ee40bd 100644 --- a/src/Interpreters/TreeRewriter.h +++ b/src/Interpreters/TreeRewriter.h @@ -35,6 +35,8 @@ struct TreeRewriterResult Aliases aliases; std::vector aggregates; + std::vector window_function_asts; + /// Which column is needed to be ARRAY-JOIN'ed to get the specified. /// For example, for `SELECT s.v ... ARRAY JOIN a AS s` will get "s.v" -> "a.v". 
NameToNameMap array_join_result_to_source; diff --git a/src/Interpreters/tests/CMakeLists.txt b/src/Interpreters/tests/CMakeLists.txt index 1bc9d7fbacb..2c8440299fb 100644 --- a/src/Interpreters/tests/CMakeLists.txt +++ b/src/Interpreters/tests/CMakeLists.txt @@ -32,6 +32,9 @@ target_link_libraries (string_hash_map_aggregation PRIVATE dbms) add_executable (string_hash_set string_hash_set.cpp) target_link_libraries (string_hash_set PRIVATE dbms) +add_executable (context context.cpp) +target_link_libraries (context PRIVATE dbms) + add_executable (two_level_hash_map two_level_hash_map.cpp) target_include_directories (two_level_hash_map SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR}) target_link_libraries (two_level_hash_map PRIVATE dbms) diff --git a/src/Interpreters/tests/context.cpp b/src/Interpreters/tests/context.cpp new file mode 100644 index 00000000000..9b908e26248 --- /dev/null +++ b/src/Interpreters/tests/context.cpp @@ -0,0 +1,90 @@ +#include +/// #define BOOST_USE_UCONTEXT +#include +// #include +// #include +#include +#include + +void __attribute__((__noinline__)) foo(std::exception_ptr exception) +{ + if (exception) + std::rethrow_exception(exception); +} + +void __attribute__((__noinline__)) bar(int a) +{ + std::cout << StackTrace().toString() << std::endl; + + if (a > 0) + throw DB::Exception(0, "hello"); +} + +void __attribute__((__noinline__)) gar(int a) +{ + char buf[1024]; + buf[1023] = a & 255; + if (a > 2) + return gar(a - 1); + else + bar(a); +} + +int main(int, char **) +try { + namespace ctx=boost::context; + int a; + std::exception_ptr exception; + // ctx::protected_fixedsize allocator + // ctx::pooled_fixedsize_stack(1024 * 64 + 2 * 2 * 1024 * 1024 * 16, 1) + ctx::fiber source{std::allocator_arg_t(), FiberStack(), [&](ctx::fiber&& sink) + { + a=0; + int b=1; + for (size_t i = 0; i < 9; ++i) + { + sink=std::move(sink).resume(); + int next=a+b; + a=b; + b=next; + } + try + { + gar(1024); + } + catch (...) + { + std::cout << "Saving exception\n"; + exception = std::current_exception(); + } + return std::move(sink); + }}; + + for (int j=0;j<10;++j) + { + try + { + source=std::move(source).resume(); + } + catch (DB::Exception & e) + { + std::cout << "Caught exception in resume " << e.getStackTraceString() << std::endl; + } + std::cout << a << " "; + } + + std::cout << std::endl; + + try + { + foo(exception); + } + catch (const DB::Exception & e) + { + std::cout << e.getStackTraceString() << std::endl; + } +} +catch (...) 
+{ + std::cerr << "Uncaught exception\n"; +} diff --git a/src/Parsers/ASTFunction.cpp b/src/Parsers/ASTFunction.cpp index c86a8e5c139..d5d03a540c9 100644 --- a/src/Parsers/ASTFunction.cpp +++ b/src/Parsers/ASTFunction.cpp @@ -1,14 +1,15 @@ -#include -#include #include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace DB { @@ -54,6 +55,21 @@ ASTPtr ASTFunction::clone() const if (arguments) { res->arguments = arguments->clone(); res->children.push_back(res->arguments); } if (parameters) { res->parameters = parameters->clone(); res->children.push_back(res->parameters); } + if (window_name) + { + res->set(res->window_name, window_name->clone()); + } + + if (window_partition_by) + { + res->set(res->window_partition_by, window_partition_by->clone()); + } + + if (window_order_by) + { + res->set(res->window_order_by, window_order_by->clone()); + } + return res; } @@ -411,44 +427,91 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format } } - if (!written) + if (written) { - settings.ostr << (settings.hilite ? hilite_function : "") << name; + return; + } - if (parameters) + settings.ostr << (settings.hilite ? hilite_function : "") << name; + + if (parameters) + { + settings.ostr << '(' << (settings.hilite ? hilite_none : ""); + parameters->formatImpl(settings, state, nested_dont_need_parens); + settings.ostr << (settings.hilite ? hilite_function : "") << ')'; + } + + if ((arguments && !arguments->children.empty()) || !no_empty_args) + settings.ostr << '(' << (settings.hilite ? hilite_none : ""); + + if (arguments) + { + bool special_hilite_regexp = settings.hilite + && (name == "match" || name == "extract" || name == "extractAll" || name == "replaceRegexpOne" + || name == "replaceRegexpAll"); + + for (size_t i = 0, size = arguments->children.size(); i < size; ++i) { - settings.ostr << '(' << (settings.hilite ? hilite_none : ""); - parameters->formatImpl(settings, state, nested_dont_need_parens); - settings.ostr << (settings.hilite ? hilite_function : "") << ')'; + if (i != 0) + settings.ostr << ", "; + + bool special_hilite = false; + if (i == 1 && special_hilite_regexp) + special_hilite = highlightStringLiteralWithMetacharacters(arguments->children[i], settings, "|()^$.[]?*+{:-"); + + if (!special_hilite) + arguments->children[i]->formatImpl(settings, state, nested_dont_need_parens); } + } - if ((arguments && !arguments->children.empty()) || !no_empty_args) - settings.ostr << '(' << (settings.hilite ? hilite_none : ""); + if ((arguments && !arguments->children.empty()) || !no_empty_args) + settings.ostr << (settings.hilite ? hilite_function : "") << ')'; - if (arguments) - { - bool special_hilite_regexp = settings.hilite - && (name == "match" || name == "extract" || name == "extractAll" || name == "replaceRegexpOne" - || name == "replaceRegexpAll"); + settings.ostr << (settings.hilite ? 
hilite_none : ""); - for (size_t i = 0, size = arguments->children.size(); i < size; ++i) - { - if (i != 0) - settings.ostr << ", "; + if (!is_window_function) + { + return; + } - bool special_hilite = false; - if (i == 1 && special_hilite_regexp) - special_hilite = highlightStringLiteralWithMetacharacters(arguments->children[i], settings, "|()^$.[]?*+{:-"); + settings.ostr << " OVER ("; + appendWindowDescription(settings, state, nested_dont_need_parens); + settings.ostr << ")"; +} - if (!special_hilite) - arguments->children[i]->formatImpl(settings, state, nested_dont_need_parens); - } - } +std::string ASTFunction::getWindowDescription() const +{ + WriteBufferFromOwnString ostr; + FormatSettings settings{ostr, true /* one_line */}; + FormatState state; + FormatStateStacked frame; + appendWindowDescription(settings, state, frame); + return ostr.str(); +} - if ((arguments && !arguments->children.empty()) || !no_empty_args) - settings.ostr << (settings.hilite ? hilite_function : "") << ')'; +void ASTFunction::appendWindowDescription(const FormatSettings & settings, + FormatState & state, FormatStateStacked frame) const +{ + if (!is_window_function) + { + return; + } - settings.ostr << (settings.hilite ? hilite_none : ""); + if (window_partition_by) + { + settings.ostr << "PARTITION BY "; + window_partition_by->formatImpl(settings, state, frame); + } + + if (window_partition_by && window_order_by) + { + settings.ostr << " "; + } + + if (window_order_by) + { + settings.ostr << "ORDER BY "; + window_order_by->formatImpl(settings, state, frame); } } diff --git a/src/Parsers/ASTFunction.h b/src/Parsers/ASTFunction.h index a6e3834ac1a..38e5f3f095c 100644 --- a/src/Parsers/ASTFunction.h +++ b/src/Parsers/ASTFunction.h @@ -8,6 +8,8 @@ namespace DB { +class ASTIdentifier; + /** AST for function application or operator. */ class ASTFunction : public ASTWithAlias @@ -18,6 +20,11 @@ public: /// parameters - for parametric aggregate function. Example: quantile(0.9)(x) - what in first parens are 'parameters'. ASTPtr parameters; + bool is_window_function = false; + ASTIdentifier * window_name; + ASTExpressionList * window_partition_by; + ASTExpressionList * window_order_by; + /// do not print empty parentheses if there are no args - compatibility with new AST for data types and engine names. bool no_empty_args = false; @@ -32,6 +39,11 @@ public: ASTPtr toLiteral() const; // Try to convert functions like Array or Tuple to a literal form. 
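A hedged sketch of how the new members are populated and consumed (the helper and query text are illustrative): after parsing count() OVER (PARTITION BY a ORDER BY b), is_window_function is set and the two list pointers hold the PARTITION BY and ORDER BY children; getWindowDescription(), declared just below, renders them back to text and is what ExpressionAnalyzer uses as the key that groups functions sharing one window.

#include <string>
#include <Parsers/ASTFunction.h>

// Illustrative only: inspect a parsed function node.
void inspectWindowCall(const DB::ASTPtr & ast)
{
    const auto * func = ast->as<DB::ASTFunction>();
    if (!func || !func->is_window_function)
        return;

    // For count() OVER (PARTITION BY a ORDER BY b) this yields something
    // like "PARTITION BY a ORDER BY b ASC".
    std::string window_key = func->getWindowDescription();

    // The window lists are ordinary children attached via IAST::set(), so
    // they survive clone() and formatting round-trips.
    bool has_partition = func->window_partition_by != nullptr;
    bool has_order = func->window_order_by != nullptr;

    (void)window_key; (void)has_partition; (void)has_order;
}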
+ void appendWindowDescription(const FormatSettings & settings, + FormatState & state, FormatStateStacked frame) const; + + std::string getWindowDescription() const; + protected: void formatImplWithoutAlias(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; void appendColumnNameImpl(WriteBuffer & ostr) const override; diff --git a/src/Parsers/ASTIndexDeclaration.cpp b/src/Parsers/ASTIndexDeclaration.cpp index 0e8f0d0f7e8..d8ebf825674 100644 --- a/src/Parsers/ASTIndexDeclaration.cpp +++ b/src/Parsers/ASTIndexDeclaration.cpp @@ -1,6 +1,8 @@ #include + #include #include +#include namespace DB diff --git a/src/Parsers/ASTIndexDeclaration.h b/src/Parsers/ASTIndexDeclaration.h index 64ef6eb2db1..8416ec6b0a6 100644 --- a/src/Parsers/ASTIndexDeclaration.h +++ b/src/Parsers/ASTIndexDeclaration.h @@ -1,12 +1,12 @@ #pragma once -#include #include - namespace DB { +class ASTFunction; + /** name BY expr TYPE typename(args) GRANULARITY int in create query */ class ASTIndexDeclaration : public IAST diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 21dd6274739..726e28005e3 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -263,6 +263,7 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ParserKeyword distinct("DISTINCT"); ParserExpressionList contents(false); ParserSelectWithUnionQuery select; + ParserKeyword over("OVER"); bool has_distinct_modifier = false; @@ -382,10 +383,96 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) function_node->children.push_back(function_node->parameters); } + if (over.ignore(pos, expected)) + { + function_node->is_window_function = true; + + // We are slightly breaking the parser interface by parsing the window + // definition into an existing ASTFunction. Normally it would take a + // reference to ASTPtr and assign it the new node. We only have a pointer + // of a different type, hence this workaround with a temporary pointer. + ASTPtr function_node_as_iast = function_node; + + ParserWindowDefinition window_definition; + if (!window_definition.parse(pos, function_node_as_iast, expected)) + { + return false; + } + } + node = function_node; return true; } +bool ParserWindowDefinition::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + ASTFunction * function = dynamic_cast(node.get()); + + // Variant 1: + // function_name ( * ) OVER window_name + // FIXME doesn't work anyway for now -- never used anywhere, window names + // can't be defined, and TreeRewriter thinks the window name is a column so + // the query fails. 
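For the parenthesized form handled below, a hedged usage sketch (the query text, the assert-style checks and the parseQuery size/depth arguments are illustrative): parsing an expression with OVER yields a single ASTFunction whose window lists ParserWindowDefinition has filled in.

#include <cassert>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ExpressionElementParsers.h>
#include <Parsers/parseQuery.h>

void parseWindowExample()
{
    DB::ParserFunction parser;
    DB::ASTPtr ast = DB::parseQuery(parser,
        "count() OVER (PARTITION BY a ORDER BY b)",
        0 /* max_query_size: unlimited */,
        1000 /* max_parser_depth */);

    const auto & func = ast->as<DB::ASTFunction &>();
    assert(func.is_window_function);
    assert(func.window_partition_by && func.window_partition_by->children.size() == 1);
    assert(func.window_order_by && func.window_order_by->children.size() == 1);
}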
+ if (pos->type != TokenType::OpeningRoundBracket) + { + ASTPtr window_name_ast; + ParserIdentifier window_name_parser; + if (window_name_parser.parse(pos, window_name_ast, expected)) + { + function->set(function->window_name, window_name_ast); + return true; + } + else + { + return false; + } + } + ++pos; + + // Variant 2: + // function_name ( * ) OVER ( window_definition ) + ParserKeyword keyword_partition_by("PARTITION BY"); + ParserNotEmptyExpressionList columns_partition_by( + false /* we don't allow declaring aliases here*/); + ParserKeyword keyword_order_by("ORDER BY"); + ParserOrderByExpressionList columns_order_by; + + if (keyword_partition_by.ignore(pos, expected)) + { + ASTPtr partition_by_ast; + if (columns_partition_by.parse(pos, partition_by_ast, expected)) + { + function->set(function->window_partition_by, partition_by_ast); + } + else + { + return false; + } + } + + if (keyword_order_by.ignore(pos, expected)) + { + ASTPtr order_by_ast; + if (columns_order_by.parse(pos, order_by_ast, expected)) + { + function->set(function->window_order_by, order_by_ast); + } + else + { + return false; + } + } + + if (pos->type != TokenType::ClosingRoundBracket) + { + expected.add(pos, "')'"); + return false; + } + ++pos; + + return true; +} + bool ParserCodecDeclarationList::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { return ParserList(std::make_unique(), diff --git a/src/Parsers/ExpressionElementParsers.h b/src/Parsers/ExpressionElementParsers.h index 6de32ab731d..917f084a700 100644 --- a/src/Parsers/ExpressionElementParsers.h +++ b/src/Parsers/ExpressionElementParsers.h @@ -156,6 +156,13 @@ protected: bool allow_function_parameters; }; +// Window definition (the thing that goes after OVER) for window function. +class ParserWindowDefinition : public IParserBase +{ + const char * getName() const override { return "window definition"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +}; + class ParserCodecDeclarationList : public IParserBase { protected: diff --git a/src/Parsers/IAST.cpp b/src/Parsers/IAST.cpp index 56fffe3891a..e223235b8e4 100644 --- a/src/Parsers/IAST.cpp +++ b/src/Parsers/IAST.cpp @@ -161,4 +161,11 @@ void IAST::dumpTree(WriteBuffer & ostr, size_t indent) const } } +std::string IAST::dumpTree(size_t indent) const +{ + WriteBufferFromOwnString wb; + dumpTree(wb, indent); + return wb.str(); +} + } diff --git a/src/Parsers/IAST.h b/src/Parsers/IAST.h index 01ce4971c45..bed6c5bcdf9 100644 --- a/src/Parsers/IAST.h +++ b/src/Parsers/IAST.h @@ -19,9 +19,6 @@ namespace DB namespace ErrorCodes { - extern const int NOT_A_COLUMN; - extern const int UNKNOWN_TYPE_OF_AST_NODE; - extern const int UNKNOWN_ELEMENT_IN_AST; extern const int LOGICAL_ERROR; } @@ -46,7 +43,7 @@ public: String getColumnName() const; virtual void appendColumnName(WriteBuffer &) const { - throw Exception("Trying to get name of not a column: " + getID(), ErrorCodes::NOT_A_COLUMN); + throw Exception("Trying to get name of not a column: " + getID(), ErrorCodes::LOGICAL_ERROR); } /** Get the alias, if any, or the canonical name of the column, if it is not. */ @@ -58,7 +55,7 @@ public: /** Set the alias. */ virtual void setAlias(const String & /*to*/) { - throw Exception("Can't set alias of " + getColumnName(), ErrorCodes::UNKNOWN_TYPE_OF_AST_NODE); + throw Exception("Can't set alias of " + getColumnName(), ErrorCodes::LOGICAL_ERROR); } /** Get the text that identifies this element. 
*/ @@ -77,6 +74,7 @@ public: virtual void updateTreeHashImpl(SipHash & hash_state) const; void dumpTree(WriteBuffer & ostr, size_t indent = 0) const; + std::string dumpTree(size_t indent = 0) const; /** Check the depth of the tree. * If max_depth is specified and the depth is greater - throw an exception. @@ -160,6 +158,7 @@ public: bool always_quote_identifiers = false; IdentifierQuotingStyle identifier_quoting_style = IdentifierQuotingStyle::Backticks; + // Newline or whitespace. char nl_or_ws; FormatSettings(WriteBuffer & ostr_, bool one_line_) @@ -208,7 +207,7 @@ public: virtual void formatImpl(const FormatSettings & /*settings*/, FormatState & /*state*/, FormatStateStacked /*frame*/) const { - throw Exception("Unknown element in AST: " + getID(), ErrorCodes::UNKNOWN_ELEMENT_IN_AST); + throw Exception("Unknown element in AST: " + getID(), ErrorCodes::LOGICAL_ERROR); } // A simple way to add some user-readable context to an error message. diff --git a/src/Parsers/New/AST/TableElementExpr.cpp b/src/Parsers/New/AST/TableElementExpr.cpp index 1336ea06b27..d994ea7eef6 100644 --- a/src/Parsers/New/AST/TableElementExpr.cpp +++ b/src/Parsers/New/AST/TableElementExpr.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index 517e07a3ba4..1646a1a01fe 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -164,7 +164,7 @@ bool PipelineExecutor::expandPipeline(Stack & stack, UInt64 pid) return true; } -bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, size_t thread_number) +bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, Queue & async_queue, size_t thread_number) { /// In this method we have ownership on edge, but node can be concurrently accessed. @@ -185,7 +185,7 @@ bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & ed if (status == ExecutingGraph::ExecStatus::Idle) { node.status = ExecutingGraph::ExecStatus::Preparing; - return prepareProcessor(edge.to, thread_number, queue, std::move(lock)); + return prepareProcessor(edge.to, thread_number, queue, async_queue, std::move(lock)); } else graph->nodes[edge.to]->processor->onUpdatePorts(); @@ -193,7 +193,7 @@ bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & ed return true; } -bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, std::unique_lock node_lock) +bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, Queue & async_queue, std::unique_lock node_lock) { /// In this method we have ownership on node. 
auto & node = *graph->nodes[pid]; @@ -248,15 +248,9 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue } case IProcessor::Status::Async: { - throw Exception("Async is temporary not supported.", ErrorCodes::LOGICAL_ERROR); - -// node.status = ExecStatus::Executing; -// addAsyncJob(pid); -// break; - } - case IProcessor::Status::Wait: - { - throw Exception("Wait is temporary not supported.", ErrorCodes::LOGICAL_ERROR); + node.status = ExecutingGraph::ExecStatus::Executing; + async_queue.push(&node); + break; } case IProcessor::Status::ExpandPipeline: { @@ -288,13 +282,13 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue { for (auto & edge : updated_direct_edges) { - if (!tryAddProcessorToStackIfUpdated(*edge, queue, thread_number)) + if (!tryAddProcessorToStackIfUpdated(*edge, queue, async_queue, thread_number)) return false; } for (auto & edge : updated_back_edges) { - if (!tryAddProcessorToStackIfUpdated(*edge, queue, thread_number)) + if (!tryAddProcessorToStackIfUpdated(*edge, queue, async_queue, thread_number)) return false; } } @@ -325,7 +319,7 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue while (!stack.empty()) { auto item = stack.top(); - if (!prepareProcessor(item, thread_number, queue, std::unique_lock(graph->nodes[item]->status_mutex))) + if (!prepareProcessor(item, thread_number, queue, async_queue, std::unique_lock(graph->nodes[item]->status_mutex))) return false; stack.pop(); @@ -378,6 +372,7 @@ void PipelineExecutor::finish() { std::lock_guard lock(task_queue_mutex); finished = true; + async_task_queue.finish(); } std::lock_guard guard(executor_contexts_mutex); @@ -502,11 +497,21 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st { std::unique_lock lock(task_queue_mutex); - if (!task_queue.empty()) + if (!context->async_tasks.empty()) { + node = context->async_tasks.front(); + context->async_tasks.pop(); + --num_waiting_async_tasks; + + if (context->async_tasks.empty()) + context->has_async_tasks = false; + } + else if (!task_queue.empty()) node = task_queue.pop(thread_num); - if (!task_queue.empty() && !threads_queue.empty() /*&& task_queue.quota() > threads_queue.size()*/) + if (node) + { + if (!task_queue.empty() && !threads_queue.empty()) { auto thread_to_wake = task_queue.getAnyThreadWithTasks(thread_num + 1 == num_threads ? 0 : (thread_num + 1)); @@ -522,13 +527,26 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st break; } - if (threads_queue.size() + 1 == num_threads) + if (threads_queue.size() + 1 == num_threads && async_task_queue.empty() && num_waiting_async_tasks == 0) { lock.unlock(); finish(); break; } +#if defined(OS_LINUX) + if (num_threads == 1) + { + /// If we execute in single thread, wait for async tasks here. + auto res = async_task_queue.wait(lock); + if (!res) + throw Exception("Empty task was returned from async task queue", ErrorCodes::LOGICAL_ERROR); + + node = static_cast(res.data); + break; + } +#endif + threads_queue.push(thread_num); } @@ -579,6 +597,7 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st /// Try to execute neighbour processor. { Queue queue; + Queue async_queue; ++num_processing_executors; while (auto * task = expand_pipeline_task.load()) @@ -587,31 +606,39 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st /// Prepare processor after execution. 
{ auto lock = std::unique_lock(node->status_mutex); - if (!prepareProcessor(node->processors_id, thread_num, queue, std::move(lock))) + if (!prepareProcessor(node->processors_id, thread_num, queue, async_queue, std::move(lock))) finish(); } node = nullptr; /// Take local task from queue if has one. - if (!queue.empty()) + if (!queue.empty() && !context->has_async_tasks) { node = queue.front(); queue.pop(); } /// Push other tasks to global queue. - if (!queue.empty()) + if (!queue.empty() || !async_queue.empty()) { std::unique_lock lock(task_queue_mutex); +#if defined(OS_LINUX) + while (!async_queue.empty() && !finished) + { + async_task_queue.addTask(thread_num, async_queue.front(), async_queue.front()->processor->schedule()); + async_queue.pop(); + } +#endif + while (!queue.empty() && !finished) { task_queue.push(queue.front(), thread_num); queue.pop(); } - if (!threads_queue.empty() && !finished /* && task_queue.quota() > threads_queue.size()*/) + if (!threads_queue.empty() && !task_queue.empty() && !finished) { auto thread_to_wake = task_queue.getAnyThreadWithTasks(thread_num + 1 == num_threads ? 0 : (thread_num + 1)); @@ -669,6 +696,7 @@ void PipelineExecutor::initializeExecution(size_t num_threads) std::lock_guard lock(task_queue_mutex); Queue queue; + Queue async_queue; size_t next_thread = 0; while (!stack.empty()) @@ -676,7 +704,7 @@ void PipelineExecutor::initializeExecution(size_t num_threads) UInt64 proc = stack.top(); stack.pop(); - prepareProcessor(proc, 0, queue, std::unique_lock(graph->nodes[proc]->status_mutex)); + prepareProcessor(proc, 0, queue, async_queue, std::unique_lock(graph->nodes[proc]->status_mutex)); while (!queue.empty()) { @@ -687,6 +715,10 @@ void PipelineExecutor::initializeExecution(size_t num_threads) if (next_thread >= num_threads) next_thread = 0; } + + while (!async_queue.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Async is only possible after work() call. Processor {}", + async_queue.front()->processor->getName()); } } } @@ -747,6 +779,26 @@ void PipelineExecutor::executeImpl(size_t num_threads) }); } +#if defined(OS_LINUX) + { + /// Wait for async tasks. + std::unique_lock lock(task_queue_mutex); + while (auto task = async_task_queue.wait(lock)) + { + auto * node = static_cast(task.data); + executor_contexts[task.thread_num]->async_tasks.push(node); + executor_contexts[task.thread_num]->has_async_tasks = true; + ++num_waiting_async_tasks; + + if (threads_queue.has(task.thread_num)) + { + threads_queue.pop(task.thread_num); + wakeUpExecutor(task.thread_num); + } + } + } +#endif + for (auto & thread : threads) if (thread.joinable()) thread.join(); diff --git a/src/Processors/Executors/PipelineExecutor.h b/src/Processors/Executors/PipelineExecutor.h index b457cca34b1..213446ad43f 100644 --- a/src/Processors/Executors/PipelineExecutor.h +++ b/src/Processors/Executors/PipelineExecutor.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -57,6 +58,12 @@ private: /// Stores processors need to be prepared. Preparing status is already set for them. TaskQueue task_queue; + /// Queue which stores tasks where processors returned Async status after prepare. + /// If multiple threads are using, main thread will wait for async tasks. + /// For single thread, will wait for async tasks only when task_queue is empty. + PollingQueue async_task_queue; + size_t num_waiting_async_tasks = 0; + ThreadsQueue threads_queue; std::mutex task_queue_mutex; @@ -90,6 +97,9 @@ private: /// This can be solved by using atomic shard ptr. 
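A hedged, self-contained usage sketch of the PollingQueue behind async_task_queue above (the mutex, opaque pointer and descriptor are illustrative; Linux only, since the non-Linux stub has neither addTask nor wait): tasks are registered with a pointer plus the descriptor obtained from IProcessor::schedule(), wait() releases the caller's lock while blocked in epoll, and an empty TaskData signals that finish() was called.

#include <mutex>
#include <Processors/Executors/PollingQueue.h>

// Illustrative only: register one descriptor and wait for it to become ready.
void pollOnce(DB::PollingQueue & queue, std::mutex & mutex, void * node, int fd)
{
    {
        std::lock_guard guard(mutex);
        queue.addTask(/* thread_number = */ 0, node, fd);
    }

    std::unique_lock lock(mutex);
    if (auto task = queue.wait(lock))   // empty TaskData means finish() was called
    {
        // task.data == node and task.fd == fd here; executeImpl() above hands
        // such a node back to executor_contexts[task.thread_num].
        (void)task;
    }
}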
std::list task_list; + std::queue async_tasks; + std::atomic_bool has_async_tasks = false; + std::condition_variable condvar; std::mutex mutex; bool wake_flag = false; @@ -126,14 +136,14 @@ private: /// Pipeline execution related methods. void addChildlessProcessorsToStack(Stack & stack); - bool tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, size_t thread_number); + bool tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, Queue & async_queue, size_t thread_number); static void addJob(ExecutingGraph::Node * execution_state); // TODO: void addAsyncJob(UInt64 pid); /// Prepare processor with pid number. /// Check parents and children of current processor and push them to stacks if they also need to be prepared. /// If processor wants to be expanded, ExpandPipelineTask from thread_number's execution context will be used. - bool prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, std::unique_lock node_lock); + bool prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, Queue & async_queue, std::unique_lock node_lock); bool doExpandPipeline(ExpandPipelineTask * task, bool processing); /// Continue executor (in case there are tasks in queue). diff --git a/src/Processors/Executors/PollingQueue.cpp b/src/Processors/Executors/PollingQueue.cpp new file mode 100644 index 00000000000..7383824a592 --- /dev/null +++ b/src/Processors/Executors/PollingQueue.cpp @@ -0,0 +1,115 @@ +#include + +#if defined(OS_LINUX) + +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_OPEN_FILE; + extern const int CANNOT_READ_FROM_SOCKET; + extern const int LOGICAL_ERROR; +} + + +PollingQueue::PollingQueue() +{ + epoll_fd = epoll_create(1); + if (-1 == epoll_fd) + throwFromErrno("Cannot create epoll descriptor", ErrorCodes::CANNOT_OPEN_FILE); + + if (-1 == pipe2(pipe_fd, O_NONBLOCK)) + throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_OPEN_FILE); + + epoll_event socket_event; + socket_event.events = EPOLLIN | EPOLLPRI; + socket_event.data.ptr = pipe_fd; + + if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &socket_event)) + throwFromErrno("Cannot add pipe descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); +} + +PollingQueue::~PollingQueue() +{ + close(epoll_fd); + close(pipe_fd[0]); + close(pipe_fd[1]); +} + +void PollingQueue::addTask(size_t thread_number, void * data, int fd) +{ + std::uintptr_t key = reinterpret_cast(data); + if (tasks.count(key)) + throw Exception("Task was already added to task queue", ErrorCodes::LOGICAL_ERROR); + + tasks[key] = TaskData{thread_number, data, fd}; + + epoll_event socket_event; + socket_event.events = EPOLLIN | EPOLLPRI; + socket_event.data.ptr = data; + + if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &socket_event)) + throwFromErrno("Cannot add socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); +} + +PollingQueue::TaskData PollingQueue::wait(std::unique_lock & lock) +{ + if (is_finished) + return {}; + + lock.unlock(); + + epoll_event event; + event.data.ptr = nullptr; + int num_events = 0; + + while (num_events == 0) + { + num_events = epoll_wait(epoll_fd, &event, 1, 0); + if (num_events == -1) + throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); + } + + lock.lock(); + + if (event.data.ptr == pipe_fd) + return {}; + + std::uintptr_t key = reinterpret_cast(event.data.ptr); + auto it = tasks.find(key); + if (it == tasks.end()) + throw Exception("Task was not found in task queue", 
ErrorCodes::LOGICAL_ERROR); + + auto res = it->second; + tasks.erase(it); + + if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_DEL, res.fd, &event)) + throwFromErrno("Cannot remove socket descriptor from epoll", ErrorCodes::CANNOT_OPEN_FILE); + + return res; +} + +void PollingQueue::finish() +{ + is_finished = true; + tasks.clear(); + + uint64_t buf = 0; + while (-1 == write(pipe_fd[1], &buf, sizeof(buf))) + { + if (errno == EAGAIN) + break; + + if (errno != EINTR) + throwFromErrno("Cannot write to pipe", ErrorCodes::CANNOT_READ_FROM_SOCKET); + } +} + +} +#endif diff --git a/src/Processors/Executors/PollingQueue.h b/src/Processors/Executors/PollingQueue.h new file mode 100644 index 00000000000..9d37bf0a2cc --- /dev/null +++ b/src/Processors/Executors/PollingQueue.h @@ -0,0 +1,60 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace DB +{ + +#if defined(OS_LINUX) + +/// This queue is used to poll descriptors. Generally, just a wrapper over epoll. +class PollingQueue +{ +public: + struct TaskData + { + size_t thread_num; + + void * data = nullptr; + int fd = -1; + + explicit operator bool() const { return data; } + }; + +private: + int epoll_fd; + int pipe_fd[2]; + std::atomic_bool is_finished = false; + std::unordered_map tasks; + +public: + PollingQueue(); + ~PollingQueue(); + + size_t size() const { return tasks.size(); } + bool empty() const { return tasks.empty(); } + + /// Add new task to queue. + void addTask(size_t thread_number, void * data, int fd); + + /// Wait for any descriptor. If there are no descriptors in the queue, blocks. + /// Returns the task data which was added to the queue, or an empty TaskData if finish() was called. + /// Lock is unlocked during waiting. + TaskData wait(std::unique_lock & lock); + + /// Interrupt waiting. + void finish(); +}; +#else +class PollingQueue +{ +public: + bool empty() { return true; } + void finish() {} +}; +#endif + +} diff --git a/src/Processors/Formats/InputStreamFromInputFormat.h b/src/Processors/Formats/InputStreamFromInputFormat.h index 792d2e45f7f..4369287d39e 100.644 --- wait
- Wait, - /// Processor wants to add other processors to pipeline. /// New processors must be obtained by expandPipeline() call. ExpandPipeline, @@ -198,16 +195,21 @@ public: throw Exception("Method 'work' is not implemented for " + getName() + " processor", ErrorCodes::NOT_IMPLEMENTED); } - /** You may call this method if 'prepare' returned Async. + /** Executor must call this method when 'prepare' returned Async. * This method cannot access any ports. It should use only data that was prepared by 'prepare' method. * - * This method should return instantly and fire an event (or many events) when asynchronous job will be done. - * When the job is not done, method 'prepare' will return Wait and the user may block and wait for next event before checking again. + * This method should instantly return epollable file descriptor which will be readable when asynchronous job is done. + * When descriptor is readable, method `work` is called to continue data processing. * - * Note that it can fire many events in EventCounter while doing its job, - * and you have to wait for next event (or do something else) every time when 'prepare' returned Wait. + * NOTE: it would be more logical to let `work()` return ASYNC status instead of prepare. This will get + * prepare() -> work() -> schedule() -> work() -> schedule() -> .. -> work() -> prepare() + * chain instead of + * prepare() -> work() -> prepare() -> schedule() -> work() -> prepare() -> schedule() -> .. -> work() -> prepare() + * + * It is expected that executor epoll using level-triggered notifications. + * Read all available data from descriptor before returning ASYNC. */ - virtual void schedule(EventCounter & /*watch*/) + virtual int schedule() { throw Exception("Method 'schedule' is not implemented for " + getName() + " processor", ErrorCodes::NOT_IMPLEMENTED); } diff --git a/src/Processors/ISource.cpp b/src/Processors/ISource.cpp index 90f3962b83e..7ae988f7cdb 100644 --- a/src/Processors/ISource.cpp +++ b/src/Processors/ISource.cpp @@ -4,6 +4,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + ISource::ISource(Block header) : IProcessor({}, {std::move(header)}), output(outputs.front()) { @@ -45,11 +50,17 @@ void ISource::work() { try { - current_chunk.chunk = generate(); - if (!current_chunk.chunk || isCancelled()) - finished = true; + if (auto chunk = tryGenerate()) + { + current_chunk.chunk = std::move(*chunk); + if (current_chunk.chunk) + has_input = true; + } else - has_input = true; + finished = true; + + if (isCancelled()) + finished = true; } catch (...) 
{ @@ -58,5 +69,19 @@ void ISource::work() } } +Chunk ISource::generate() +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "generate is not implemented for {}", getName()); +} + +std::optional ISource::tryGenerate() +{ + auto chunk = generate(); + if (!chunk) + return {}; + + return chunk; +} + } diff --git a/src/Processors/ISource.h b/src/Processors/ISource.h index 9be21c3a398..b7e2b5dce8e 100644 --- a/src/Processors/ISource.h +++ b/src/Processors/ISource.h @@ -15,7 +15,8 @@ protected: bool got_exception = false; Port::Data current_chunk; - virtual Chunk generate() = 0; + virtual Chunk generate(); + virtual std::optional tryGenerate(); public: ISource(Block header); diff --git a/src/Processors/QueryPlan/WindowStep.cpp b/src/Processors/QueryPlan/WindowStep.cpp new file mode 100644 index 00000000000..82c589b8b20 --- /dev/null +++ b/src/Processors/QueryPlan/WindowStep.cpp @@ -0,0 +1,111 @@ +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +static ITransformingStep::Traits getTraits() +{ + return ITransformingStep::Traits + { + { + .preserves_distinct_columns = true, + .returns_single_stream = false, + .preserves_number_of_streams = true, + .preserves_sorting = true, + }, + { + .preserves_number_of_rows = true + } + }; +} + +static Block addWindowFunctionResultColumns(const Block & block, + std::vector window_functions) +{ + auto result = block; + + for (const auto & f : window_functions) + { + ColumnWithTypeAndName column_with_type; + column_with_type.name = f.column_name; + column_with_type.type = f.aggregate_function->getReturnType(); + column_with_type.column = column_with_type.type->createColumn(); + + result.insert(column_with_type); + } + + return result; +} + +WindowStep::WindowStep(const DataStream & input_stream_, + const WindowDescription & window_description_, + const std::vector & window_functions_) + : ITransformingStep( + input_stream_, + addWindowFunctionResultColumns(input_stream_.header, + window_functions_), + getTraits()) + , window_description(window_description_) + , window_functions(window_functions_) + , input_header(input_stream_.header) +{ + // We don't remove any columns, only add, so probably we don't have to update + // the output DataStream::distinct_columns. +} + +void WindowStep::transformPipeline(QueryPipeline & pipeline) +{ + pipeline.addSimpleTransform([&](const Block & /*header*/) + { + return std::make_shared(input_header, + output_stream->header, window_description, window_functions); + }); + + assertBlocksHaveEqualStructure(pipeline.getHeader(), output_stream->header, + "WindowStep transform for '" + window_description.window_name + "'"); +} + +void WindowStep::describeActions(FormatSettings & settings) const +{ + String prefix(settings.offset, ' '); + settings.out << prefix << "Window: ("; + if (!window_description.partition_by.empty()) + { + settings.out << "PARTITION BY "; + for (size_t i = 0; i < window_description.partition_by.size(); ++i) + { + if (i > 0) + { + settings.out << ", "; + } + + settings.out << window_description.partition_by[i].column_name; + } + } + if (!window_description.partition_by.empty() + && !window_description.order_by.empty()) + { + settings.out << " "; + } + if (!window_description.order_by.empty()) + { + settings.out << "ORDER BY " + << dumpSortDescription(window_description.order_by); + } + settings.out << ")\n"; + + for (size_t i = 0; i < window_functions.size(); ++i) + { + settings.out << prefix << (i == 0 ? 
"Functions: " + : " "); + settings.out << window_functions[i].column_name << "\n"; + } +} + +} diff --git a/src/Processors/QueryPlan/WindowStep.h b/src/Processors/QueryPlan/WindowStep.h new file mode 100644 index 00000000000..069d02c655c --- /dev/null +++ b/src/Processors/QueryPlan/WindowStep.h @@ -0,0 +1,33 @@ +#pragma once +#include + +#include + +namespace DB +{ + +class ActionsDAG; +using ActionsDAGPtr = std::shared_ptr; + +class WindowTransform; + +class WindowStep : public ITransformingStep +{ +public: + explicit WindowStep(const DataStream & input_stream_, + const WindowDescription & window_description_, + const std::vector & window_functions_); + + String getName() const override { return "Window"; } + + void transformPipeline(QueryPipeline & pipeline) override; + + void describeActions(FormatSettings & settings) const override; + +private: + WindowDescription window_description; + std::vector window_functions; + Block input_header; +}; + +} diff --git a/src/Processors/Sources/RemoteSource.cpp b/src/Processors/Sources/RemoteSource.cpp index 2efbf3ed7c2..bf3ef32214d 100644 --- a/src/Processors/Sources/RemoteSource.cpp +++ b/src/Processors/Sources/RemoteSource.cpp @@ -1,14 +1,16 @@ #include #include +#include #include #include namespace DB { -RemoteSource::RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation_info_) +RemoteSource::RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation_info_, bool async_read_) : SourceWithProgress(executor->getHeader(), false) , add_aggregation_info(add_aggregation_info_), query_executor(std::move(executor)) + , async_read(async_read_) { /// Add AggregatedChunkInfo if we expect DataTypeAggregateFunction as a result. const auto & sample = getPort().getHeader(); @@ -21,15 +23,28 @@ RemoteSource::~RemoteSource() = default; ISource::Status RemoteSource::prepare() { + /// Check if query was cancelled before returning Async status. Otherwise it may lead to infinite loop. + if (was_query_canceled) + { + getPort().finish(); + return Status::Finished; + } + + if (is_async_state) + return Status::Async; + Status status = SourceWithProgress::prepare(); /// To avoid resetting the connection (because of "unfinished" query) in the /// RemoteQueryExecutor it should be finished explicitly. if (status == Status::Finished) - query_executor->finish(); + { + query_executor->finish(&read_context); + is_async_state = false; + } return status; } -Chunk RemoteSource::generate() +std::optional RemoteSource::tryGenerate() { /// onCancel() will do the cancel if the query was sent. 
if (was_query_canceled) @@ -52,11 +67,28 @@ Chunk RemoteSource::generate() was_query_sent = true; } - auto block = query_executor->read(); + Block block; + + if (async_read) + { + auto res = query_executor->read(read_context); + if (std::holds_alternative(res)) + { + fd = std::get(res); + is_async_state = true; + return Chunk(); + } + + is_async_state = false; + + block = std::get(std::move(res)); + } + else + block = query_executor->read(); if (!block) { - query_executor->finish(); + query_executor->finish(&read_context); return {}; } @@ -77,7 +109,18 @@ Chunk RemoteSource::generate() void RemoteSource::onCancel() { was_query_canceled = true; - query_executor->cancel(); + query_executor->cancel(&read_context); + // is_async_state = false; +} + +void RemoteSource::onUpdatePorts() +{ + if (getPort().isFinished()) + { + was_query_canceled = true; + query_executor->finish(&read_context); + // is_async_state = false; + } } @@ -123,9 +166,9 @@ Chunk RemoteExtremesSource::generate() Pipe createRemoteSourcePipe( RemoteQueryExecutorPtr query_executor, - bool add_aggregation_info, bool add_totals, bool add_extremes) + bool add_aggregation_info, bool add_totals, bool add_extremes, bool async_read) { - Pipe pipe(std::make_shared(query_executor, add_aggregation_info)); + Pipe pipe(std::make_shared(query_executor, add_aggregation_info, async_read)); if (add_totals) pipe.addTotalsSource(std::make_shared(query_executor)); diff --git a/src/Processors/Sources/RemoteSource.h b/src/Processors/Sources/RemoteSource.h index 7b537023306..2109cb5eba4 100644 --- a/src/Processors/Sources/RemoteSource.h +++ b/src/Processors/Sources/RemoteSource.h @@ -11,6 +11,8 @@ namespace DB class RemoteQueryExecutor; using RemoteQueryExecutorPtr = std::shared_ptr; +class RemoteQueryExecutorReadContext; + /// Source from RemoteQueryExecutor. Executes remote query and returns query result chunks. class RemoteSource : public SourceWithProgress { @@ -18,7 +20,7 @@ public: /// Flag add_aggregation_info tells if AggregatedChunkInfo should be added to result chunk. /// AggregatedChunkInfo stores the bucket number used for two-level aggregation. /// This flag should be typically enabled for queries with GROUP BY which are executed till WithMergeableState. - RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation_info_); + RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation_info_, bool async_read_); ~RemoteSource() override; Status prepare() override; @@ -27,14 +29,12 @@ public: void setRowsBeforeLimitCounter(RowsBeforeLimitCounterPtr counter) { rows_before_limit.swap(counter); } /// Stop reading from stream if output port is finished. - void onUpdatePorts() override - { - if (getPort().isFinished()) - cancel(); - } + void onUpdatePorts() override; + + int schedule() override { return fd; } protected: - Chunk generate() override; + std::optional tryGenerate() override; void onCancel() override; private: @@ -43,6 +43,11 @@ private: bool add_aggregation_info = false; RemoteQueryExecutorPtr query_executor; RowsBeforeLimitCounterPtr rows_before_limit; + + const bool async_read; + bool is_async_state = false; + std::unique_ptr read_context; + int fd = -1; }; /// Totals source from RemoteQueryExecutor. @@ -80,6 +85,6 @@ private: /// Create pipe with remote sources. 
Pipe createRemoteSourcePipe( RemoteQueryExecutorPtr query_executor, - bool add_aggregation_info, bool add_totals, bool add_extremes); + bool add_aggregation_info, bool add_totals, bool add_extremes, bool async_read); } diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp new file mode 100644 index 00000000000..6e8b0ea8e39 --- /dev/null +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -0,0 +1,184 @@ +#include + +#include + +#include + +namespace DB +{ + +WindowTransform::WindowTransform(const Block & input_header_, + const Block & output_header_, + const WindowDescription & window_description_, + const std::vector & window_function_descriptions + ) + : ISimpleTransform(input_header_, output_header_, + false /* skip_empty_chunks */) + , input_header(input_header_) + , window_description(window_description_) +{ + workspaces.reserve(window_function_descriptions.size()); + for (const auto & f : window_function_descriptions) + { + WindowFunctionWorkspace workspace; + workspace.window_function = f; + + const auto & aggregate_function + = workspace.window_function.aggregate_function; + if (!arena && aggregate_function->allocatesMemoryInArena()) + { + arena = std::make_unique(); + } + + workspace.argument_column_indices.reserve( + workspace.window_function.argument_names.size()); + workspace.argument_columns.reserve( + workspace.window_function.argument_names.size()); + for (const auto & argument_name : workspace.window_function.argument_names) + { + workspace.argument_column_indices.push_back( + input_header.getPositionByName(argument_name)); + } + + workspace.aggregate_function_state.reset(aggregate_function->sizeOfData(), + aggregate_function->alignOfData()); + aggregate_function->create(workspace.aggregate_function_state.data()); + + workspaces.push_back(std::move(workspace)); + } + + partition_by_indices.reserve(window_description.partition_by.size()); + for (const auto & column : window_description.partition_by) + { + partition_by_indices.push_back( + input_header.getPositionByName(column.column_name)); + } + partition_start_columns.resize(partition_by_indices.size(), nullptr); + partition_start_row = 0; +} + +WindowTransform::~WindowTransform() +{ + // Some states may be not created yet if the creation failed. + for (auto & ws : workspaces) + { + ws.window_function.aggregate_function->destroy( + ws.aggregate_function_state.data()); + } +} + +void WindowTransform::transform(Chunk & chunk) +{ + const size_t num_rows = chunk.getNumRows(); + auto columns = chunk.detachColumns(); + + for (auto & ws : workspaces) + { + ws.argument_columns.clear(); + for (const auto column_index : ws.argument_column_indices) + { + ws.argument_columns.push_back(columns[column_index].get()); + } + + ws.result_column = ws.window_function.aggregate_function->getReturnType() + ->createColumn(); + } + + // We loop for all window functions for each row. Switching the loops might + // be more efficient, because we would run less code and access less data in + // the inner loop. If you change this, don't forget to fix the calculation of + // partition boundaries. Probably it has to be precalculated and stored as + // an array of offsets. An interesting optimization would be to pass it as + // an extra column from the previous sorting step -- that step might need to + // make similar comparison anyway, if it's sorting only by the PARTITION BY + // columns. + for (size_t row = 0; row < num_rows; row++) + { + // Check whether the new partition has started. 
We have to reset the + // aggregate functions when the new partition starts. + assert(partition_start_columns.size() == partition_by_indices.size()); + bool new_partition = false; + if (partition_start_columns.empty()) + { + // No PARTITION BY at all, do nothing. + } + else if (partition_start_columns[0] == nullptr) + { + // This is the first partition. + new_partition = true; + partition_start_columns.clear(); + for (const auto i : partition_by_indices) + { + partition_start_columns.push_back(columns[i]); + } + partition_start_row = row; + } + else + { + // Check whether the new partition started, by comparing all the + // PARTITION BY columns. + size_t first_inequal_column = 0; + for (; first_inequal_column < partition_start_columns.size(); + ++first_inequal_column) + { + const auto * current_column = columns[ + partition_by_indices[first_inequal_column]].get(); + + if (current_column->compareAt(row, partition_start_row, + *partition_start_columns[first_inequal_column], + 1 /* nan_direction_hint */) != 0) + { + break; + } + } + + if (first_inequal_column < partition_start_columns.size()) + { + // The new partition has started. Remember where. + new_partition = true; + partition_start_columns.clear(); + for (const auto i : partition_by_indices) + { + partition_start_columns.push_back(columns[i]); + } + partition_start_row = row; + } + } + + for (auto & ws : workspaces) + { + const auto & f = ws.window_function; + const auto * a = f.aggregate_function.get(); + auto * buf = ws.aggregate_function_state.data(); + + if (new_partition) + { + // Reset the aggregate function states. + a->destroy(buf); + a->create(buf); + } + + // Update the aggregate function state and save the result. + a->add(buf, + ws.argument_columns.data(), + row, + arena.get()); + + a->insertResultInto(buf, + *ws.result_column, + arena.get()); + } + } + + // We have to release the mutable reference to the result column before we + // return this block, or else extra copying may occur when the subsequent + // processors modify the block. Workspaces live longer than individual blocks. + for (auto & ws : workspaces) + { + columns.push_back(std::move(ws.result_column)); + } + + chunk.setColumns(std::move(columns), num_rows); +} + +} diff --git a/src/Processors/Transforms/WindowTransform.h b/src/Processors/Transforms/WindowTransform.h new file mode 100644 index 00000000000..3d284263171 --- /dev/null +++ b/src/Processors/Transforms/WindowTransform.h @@ -0,0 +1,77 @@ +#pragma once +#include + +#include + +#include + +namespace DB +{ + +class ExpressionActions; +using ExpressionActionsPtr = std::shared_ptr; + +class Arena; + +// Runtime data for computing one window function +struct WindowFunctionWorkspace +{ + WindowFunctionDescription window_function; + AlignedBuffer aggregate_function_state; + std::vector argument_column_indices; + + // Argument and result columns. Be careful, they are per-chunk. + std::vector argument_columns; + MutableColumnPtr result_column; +}; + +/* + * Computes several window functions that share the same window. The input must + * be sorted correctly for this window (PARTITION BY, then ORDER BY). 
+ */ +class WindowTransform : public ISimpleTransform +{ +public: + WindowTransform( + const Block & input_header_, + const Block & output_header_, + const WindowDescription & window_description_, + const std::vector & + window_function_descriptions); + + ~WindowTransform() override; + + String getName() const override + { + return "WindowTransform"; + } + + static Block transformHeader(Block header, const ExpressionActionsPtr & expression); + + void transform(Chunk & chunk) override; + +public: + Block input_header; + + WindowDescription window_description; + + // Indices of the PARTITION BY columns in block. + std::vector partition_by_indices; + + // The columns for PARTITION BY and the row in these columns where the + // current partition started. They might be in some of the previous blocks, + // so we have to keep the shared ownership of the columns. We don't keep the + // entire block to save memory, only the needed columns, in the same order + // as the partition_by_indices array. + // Can be empty if there is no PARTITION BY. + // Columns are nullptr when it is the first partition. + std::vector partition_start_columns; + size_t partition_start_row = 0; + + // Data for computing the window functions. + std::vector workspaces; + + std::unique_ptr arena; +}; + +} diff --git a/src/Processors/ya.make b/src/Processors/ya.make index eacc0c44d24..263c24ff35c 100644 --- a/src/Processors/ya.make +++ b/src/Processors/ya.make @@ -17,6 +17,7 @@ SRCS( Executors/ExecutingGraph.cpp Executors/PipelineExecutingBlockInputStream.cpp Executors/PipelineExecutor.cpp + Executors/PollingQueue.cpp Executors/PullingAsyncPipelineExecutor.cpp Executors/PullingPipelineExecutor.cpp ForkProcessor.cpp @@ -123,6 +124,7 @@ SRCS( QueryPlan/SettingQuotaAndLimitsStep.cpp QueryPlan/TotalsHavingStep.cpp QueryPlan/UnionStep.cpp + QueryPlan/WindowStep.cpp ResizeProcessor.cpp Sources/DelayedSource.cpp Sources/RemoteSource.cpp @@ -155,6 +157,7 @@ SRCS( Transforms/RollupTransform.cpp Transforms/SortingTransform.cpp Transforms/TotalsHavingTransform.cpp + Transforms/WindowTransform.cpp printPipeline.cpp ) diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 94f37254b91..5f500518516 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -3,21 +3,22 @@ #include #include -#include +#include +#include +#include +#include +#include +#include +#include #include #include #include #include -#include -#include -#include -#include -#include +#include namespace DB { - namespace ErrorCodes { extern const int TABLE_IS_DROPPED; @@ -32,17 +33,18 @@ bool IStorage::isVirtualColumn(const String & column_name, const StorageMetadata } RWLockImpl::LockHolder IStorage::tryLockTimed( - const RWLock & rwlock, RWLockImpl::Type type, const String & query_id, const std::chrono::milliseconds & acquire_timeout) const + const RWLock & rwlock, RWLockImpl::Type type, const String & query_id, const std::chrono::milliseconds & acquire_timeout) const { auto lock_holder = rwlock->getLock(type, query_id, acquire_timeout); if (!lock_holder) { const String type_str = type == RWLockImpl::Type::Read ? "READ" : "WRITE"; throw Exception( - type_str + " locking attempt on \"" + getStorageID().getFullTableName() + - "\" has timed out! (" + std::to_string(acquire_timeout.count()) + "ms) " - "Possible deadlock avoided. Client should retry.", - ErrorCodes::DEADLOCK_AVOIDED); + type_str + " locking attempt on \"" + getStorageID().getFullTableName() + "\" has timed out! 
(" + + std::to_string(acquire_timeout.count()) + + "ms) " + "Possible deadlock avoided. Client should retry.", + ErrorCodes::DEADLOCK_AVOIDED); } return lock_holder; } @@ -82,26 +84,26 @@ TableExclusiveLockHolder IStorage::lockExclusively(const String & query_id, cons } Pipe IStorage::read( - const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, - SelectQueryInfo & /*query_info*/, - const Context & /*context*/, - QueryProcessingStage::Enum /*processed_stage*/, - size_t /*max_block_size*/, - unsigned /*num_streams*/) + const Names & /*column_names*/, + const StorageMetadataPtr & /*metadata_snapshot*/, + SelectQueryInfo & /*query_info*/, + const Context & /*context*/, + QueryProcessingStage::Enum /*processed_stage*/, + size_t /*max_block_size*/, + unsigned /*num_streams*/) { throw Exception("Method read is not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED); } void IStorage::read( - QueryPlan & query_plan, - const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, - SelectQueryInfo & query_info, - const Context & context, - QueryProcessingStage::Enum processed_stage, - size_t max_block_size, - unsigned num_streams) + QueryPlan & query_plan, + const Names & column_names, + const StorageMetadataPtr & metadata_snapshot, + SelectQueryInfo & query_info, + const Context & context, + QueryProcessingStage::Enum processed_stage, + size_t max_block_size, + unsigned num_streams) { auto pipe = read(column_names, metadata_snapshot, query_info, context, processed_stage, max_block_size, num_streams); if (pipe.empty()) @@ -117,15 +119,12 @@ void IStorage::read( } Pipe IStorage::alterPartition( - const StorageMetadataPtr & /* metadata_snapshot */, - const PartitionCommands & /* commands */, - const Context & /* context */) + const StorageMetadataPtr & /* metadata_snapshot */, const PartitionCommands & /* commands */, const Context & /* context */) { throw Exception("Partition operations are not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED); } -void IStorage::alter( - const AlterCommands & params, const Context & context, TableLockHolder &) +void IStorage::alter(const AlterCommands & params, const Context & context, TableLockHolder &) { auto table_id = getStorageID(); StorageInMemoryMetadata new_metadata = getInMemoryMetadata(); @@ -146,7 +145,8 @@ void IStorage::checkAlterIsPossible(const AlterCommands & commands, const Settin } } -void IStorage::checkAlterPartitionIsPossible(const PartitionCommands & /*commands*/, const StorageMetadataPtr & /*metadata_snapshot*/, const Settings & /*settings*/) const +void IStorage::checkAlterPartitionIsPossible( + const PartitionCommands & /*commands*/, const StorageMetadataPtr & /*metadata_snapshot*/, const Settings & /*settings*/) const { throw Exception("Table engine " + getName() + " doesn't support partitioning", ErrorCodes::NOT_IMPLEMENTED); } @@ -168,4 +168,52 @@ NamesAndTypesList IStorage::getVirtuals() const return {}; } +Names IStorage::getAllRegisteredNames() const +{ + Names result; + auto getter = [](const auto & column) { return column.name; }; + const NamesAndTypesList & available_columns = getInMemoryMetadata().getColumns().getAllPhysical(); + std::transform(available_columns.begin(), available_columns.end(), std::back_inserter(result), getter); + return result; +} + +std::string PrewhereDAGInfo::dump() const +{ + WriteBufferFromOwnString ss; + ss << "PrewhereDagInfo\n"; + + if (alias_actions) + { + ss << "alias_actions " << alias_actions->dumpDAG() << "\n"; + } + + if 
(prewhere_actions) + { + ss << "prewhere_actions " << prewhere_actions->dumpDAG() << "\n"; + } + + if (remove_columns_actions) + { + ss << "remove_columns_actions " << remove_columns_actions->dumpDAG() << "\n"; + } + + ss << "remove_prewhere_column " << remove_prewhere_column + << ", need_filter " << need_filter << "\n"; + + return ss.str(); +} + +std::string FilterInfo::dump() const +{ + WriteBufferFromOwnString ss; + ss << "FilterInfo for column '" << column_name <<"', do_remove_column " + << do_remove_column << "\n"; + if (actions_dag) + { + ss << "actions_dag " << actions_dag->dumpDAG() << "\n"; + } + + return ss.str(); +} + } diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index c0225f4bb09..ba74a09b6ec 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -78,7 +78,7 @@ struct ColumnSize * - data storage structure (compression, etc.) * - concurrent access to data (locks, etc.) */ -class IStorage : public std::enable_shared_from_this, public TypePromotion +class IStorage : public std::enable_shared_from_this, public TypePromotion, public IHints<1, IStorage> { public: IStorage() = delete; @@ -87,7 +87,6 @@ public: : storage_id(std::move(storage_id_)) , metadata(std::make_unique()) {} //-V730 - virtual ~IStorage() = default; IStorage(const IStorage &) = delete; IStorage & operator=(const IStorage &) = delete; @@ -172,6 +171,7 @@ public: /// By default return empty list of columns. virtual NamesAndTypesList getVirtuals() const; + Names getAllRegisteredNames() const override; protected: /// Returns whether the column is virtual - by default all columns are real. diff --git a/src/Storages/IndicesDescription.cpp b/src/Storages/IndicesDescription.cpp index 8adf2be1bd4..dbc95615383 100644 --- a/src/Storages/IndicesDescription.cpp +++ b/src/Storages/IndicesDescription.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include diff --git a/src/Storages/Kafka/KafkaBlockInputStream.cpp b/src/Storages/Kafka/KafkaBlockInputStream.cpp index 6a137bd4b8b..bf985902b4d 100644 --- a/src/Storages/Kafka/KafkaBlockInputStream.cpp +++ b/src/Storages/Kafka/KafkaBlockInputStream.cpp @@ -125,7 +125,6 @@ Block KafkaBlockInputStream::readImpl() } case IProcessor::Status::NeedData: case IProcessor::Status::Async: - case IProcessor::Status::Wait: case IProcessor::Status::ExpandPipeline: throw Exception("Source processor returned status " + IProcessor::statusToName(status), ErrorCodes::LOGICAL_ERROR); } diff --git a/src/Storages/MergeTree/MergeTreeIndices.h b/src/Storages/MergeTree/MergeTreeIndices.h index 28795ae46b5..c7b9dfb123e 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.h +++ b/src/Storages/MergeTree/MergeTreeIndices.h @@ -10,7 +10,6 @@ #include #include #include -#include #include constexpr auto INDEX_FILE_PREFIX = "skp_idx_"; diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 2f3931786a6..f6401e0a4b8 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -111,9 +111,6 @@ struct Settings; M(Bool, remove_empty_parts, true, "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm", 0) \ M(Bool, assign_part_uuids, false, "Generate UUIDs for parts. Before enabling check that all replicas support new format.", 0) \ \ - /** Settings for testing purposes */ \ - M(Bool, randomize_part_type, false, "For testing purposes only. Randomizes part type between wide and compact", 0) \ - \ /** Obsolete settings. 
Kept for backward compatibility only. */ \ M(UInt64, min_relative_delay_to_yield_leadership, 120, "Obsolete setting, does nothing.", 0) \ M(UInt64, check_delay_period, 60, "Obsolete setting, does nothing.", 0) \ diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 91bf105af74..a2429cead3d 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -234,25 +234,6 @@ If you use the Replicated version of engines, see https://clickhouse.tech/docs/e } -static void randomizePartTypeSettings(const std::unique_ptr & storage_settings) -{ - static constexpr auto MAX_THRESHOLD_FOR_ROWS = 100000; - static constexpr auto MAX_THRESHOLD_FOR_BYTES = 1024 * 1024 * 10; - - /// Create all parts in wide format with probability 1/3. - if (thread_local_rng() % 3 == 0) - { - storage_settings->min_rows_for_wide_part = 0; - storage_settings->min_bytes_for_wide_part = 0; - } - else - { - storage_settings->min_rows_for_wide_part = std::uniform_int_distribution{0, MAX_THRESHOLD_FOR_ROWS}(thread_local_rng); - storage_settings->min_bytes_for_wide_part = std::uniform_int_distribution{0, MAX_THRESHOLD_FOR_BYTES}(thread_local_rng); - } -} - - static StoragePtr create(const StorageFactory::Arguments & args) { /** [Replicated][|Summing|Collapsing|Aggregating|Replacing|Graphite]MergeTree (2 * 7 combinations) engines @@ -737,20 +718,6 @@ static StoragePtr create(const StorageFactory::Arguments & args) ++arg_num; } - /// Allow to randomize part type for tests to cover more cases. - /// But if settings were set explicitly restrict it. - if (storage_settings->randomize_part_type - && !storage_settings->min_rows_for_wide_part.changed - && !storage_settings->min_bytes_for_wide_part.changed) - { - randomizePartTypeSettings(storage_settings); - LOG_INFO(&Poco::Logger::get(args.table_id.getNameForLogs() + " (registerStorageMergeTree)"), - "Applied setting 'randomize_part_type'. " - "Setting 'min_rows_for_wide_part' changed to {}. " - "Setting 'min_bytes_for_wide_part' changed to {}.", - storage_settings->min_rows_for_wide_part, storage_settings->min_bytes_for_wide_part); - } - if (arg_num != arg_cnt) throw Exception("Wrong number of engine arguments.", ErrorCodes::BAD_ARGUMENTS); diff --git a/src/Storages/RabbitMQ/RabbitMQBlockInputStream.cpp b/src/Storages/RabbitMQ/RabbitMQBlockInputStream.cpp index bb8500949fb..c5c43440228 100644 --- a/src/Storages/RabbitMQ/RabbitMQBlockInputStream.cpp +++ b/src/Storages/RabbitMQ/RabbitMQBlockInputStream.cpp @@ -132,7 +132,6 @@ Block RabbitMQBlockInputStream::readImpl() } case IProcessor::Status::NeedData: case IProcessor::Status::Async: - case IProcessor::Status::Wait: case IProcessor::Status::ExpandPipeline: throw Exception("Source processor returned status " + IProcessor::statusToName(status), ErrorCodes::LOGICAL_ERROR); } diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 683b2463d1f..5a3ada6288b 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -45,6 +45,8 @@ struct PrewhereDAGInfo PrewhereDAGInfo() = default; explicit PrewhereDAGInfo(ActionsDAGPtr prewhere_actions_, String prewhere_column_name_) : prewhere_actions(std::move(prewhere_actions_)), prewhere_column_name(std::move(prewhere_column_name_)) {} + + std::string dump() const; }; /// Helper struct to store all the information about the filter expression. 
@@ -53,6 +55,8 @@ struct FilterInfo ActionsDAGPtr actions_dag; String column_name; bool do_remove_column = false; + + std::string dump() const; }; struct InputOrderInfo diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index c81b3538042..8d1bee3e889 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1124,41 +1124,71 @@ ActionLock StorageMergeTree::stopMergesAndWait() } +MergeTreeDataPartPtr StorageMergeTree::outdatePart(const String & part_name, bool force) +{ + + if (force) + { + /// Forcefully stop merges and make part outdated + auto merge_blocker = stopMergesAndWait(); + auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Committed}); + if (!part) + throw Exception("Part " + part_name + " not found, won't try to drop it.", ErrorCodes::NO_SUCH_DATA_PART); + removePartsFromWorkingSet({part}, true); + return part; + } + else + { + + /// Wait merges selector + std::unique_lock lock(currently_processing_in_background_mutex); + + auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Committed}); + /// It's okay, part was already removed + if (!part) + return nullptr; + + /// Part will be "removed" by merge or mutation, it's OK in case of some + /// background cleanup processes like removing of empty parts. + if (currently_merging_mutating_parts.count(part)) + return nullptr; + + removePartsFromWorkingSet({part}, true); + return part; + } +} + void StorageMergeTree::dropPartition(const ASTPtr & partition, bool detach, bool drop_part, const Context & context, bool throw_if_noop) { { - /// Asks to complete merges and does not allow them to start. - /// This protects against "revival" of data for a removed partition after completion of merge. - auto merge_blocker = stopMergesAndWait(); - - auto metadata_snapshot = getInMemoryMetadataPtr(); - MergeTreeData::DataPartsVector parts_to_remove; + auto metadata_snapshot = getInMemoryMetadataPtr(); if (drop_part) { - String part_name = partition->as().value.safeGet(); - auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Committed}); - - if (part) - parts_to_remove.push_back(part); - else if (throw_if_noop) - throw Exception("Part " + part_name + " not found, won't try to drop it.", ErrorCodes::NO_SUCH_DATA_PART); - else + auto part = outdatePart(partition->as().value.safeGet(), throw_if_noop); + /// Nothing to do, part was removed in some different way + if (!part) return; + + parts_to_remove.push_back(part); } else { + /// Asks to complete merges and does not allow them to start. + /// This protects against "revival" of data for a removed partition after completion of merge. + auto merge_blocker = stopMergesAndWait(); String partition_id = getPartitionIDFromQuery(partition, context); parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); - } - // TODO should we throw an exception if parts_to_remove is empty? - removePartsFromWorkingSet(parts_to_remove, true); + /// TODO should we throw an exception if parts_to_remove is empty? 
+ removePartsFromWorkingSet(parts_to_remove, true); + } if (detach) { /// If DETACH clone parts to detached/ directory + /// NOTE: no race with background cleanup until we hold pointers to parts for (const auto & part : parts_to_remove) { LOG_INFO(log, "Detaching {}", part->relative_path); diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index e6ebad19788..3263f124afa 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -135,6 +135,10 @@ private: */ bool merge(bool aggressive, const String & partition_id, bool final, bool deduplicate, const Names & deduplicate_by_columns, String * out_disable_reason = nullptr, bool optimize_skip_merged_partitions = false); + /// Make part state outdated and queue it to remove without timeout + /// If force, then stop merges and block them until part state became outdated. Throw exception if part doesn't exists + /// If not force, then take merges selector and check that part is not participating in background operations. + MergeTreeDataPartPtr outdatePart(const String & part_name, bool force); ActionLock stopMergesAndWait(); /// Allocate block number for new mutation, write mutation to disk diff --git a/src/Storages/System/StorageSystemStackTrace.cpp b/src/Storages/System/StorageSystemStackTrace.cpp index 9edcb1ede47..0b5e82a1f3d 100644 --- a/src/Storages/System/StorageSystemStackTrace.cpp +++ b/src/Storages/System/StorageSystemStackTrace.cpp @@ -33,12 +33,24 @@ namespace ErrorCodes namespace { - const pid_t expected_pid = getpid(); + // Initialized in StorageSystemStackTrace's ctor and used in signalHandler. + std::atomic expected_pid; const int sig = SIGRTMIN; std::atomic sequence_num = 0; /// For messages sent via pipe. + std::atomic data_ready_num = 0; + std::atomic signal_latch = false; /// Only need for thread sanitizer. - std::optional stack_trace; + /** Notes: + * Only one query from the table can be processed at the moment of time. + * This is ensured by the mutex in fillData function. + * We obtain information about threads by sending signal and receiving info from the signal handler. + * Information is passed via global variables and pipe is used for signaling. + * Actually we can send all information via pipe, but we read from it with timeout just in case, + * so it's convenient to use is only for signaling. + */ + + StackTrace stack_trace{NoCapture{}}; constexpr size_t max_query_id_size = 128; char query_id_data[max_query_id_size]; @@ -56,25 +68,34 @@ namespace return; /// Signal received too late. - if (info->si_value.sival_int != sequence_num.load(std::memory_order_relaxed)) + int notification_num = info->si_value.sival_int; + if (notification_num != sequence_num.load(std::memory_order_acquire)) + return; + + bool expected = false; + if (!signal_latch.compare_exchange_strong(expected, true, std::memory_order_acquire)) return; /// All these methods are signal-safe. const ucontext_t signal_context = *reinterpret_cast(context); - stack_trace.emplace(signal_context); + stack_trace = StackTrace(signal_context); StringRef query_id = CurrentThread::getQueryId(); query_id_size = std::min(query_id.size, max_query_id_size); if (query_id.data && query_id.size) memcpy(query_id_data, query_id.data, query_id_size); - int notification_num = info->si_value.sival_int; + /// This is unneeded (because we synchronize through pipe) but makes TSan happy. 
+ data_ready_num.store(notification_num, std::memory_order_release); + ssize_t res = ::write(notification_pipe.fds_rw[1], ¬ification_num, sizeof(notification_num)); /// We cannot do anything if write failed. (void)res; errno = saved_errno; + + signal_latch.store(false, std::memory_order_release); } /// Wait for data in pipe and read it. @@ -132,7 +153,7 @@ StorageSystemStackTrace::StorageSystemStackTrace(const StorageID & table_id_) notification_pipe.open(); /// Setup signal handler. - + expected_pid = getpid(); struct sigaction sa{}; sa.sa_sigaction = signalHandler; sa.sa_flags = SA_SIGINFO; @@ -179,7 +200,7 @@ void StorageSystemStackTrace::fillData(MutableColumns & res_columns, const Conte pid_t tid = parse(it->path().filename()); sigval sig_value{}; - sig_value.sival_int = sequence_num.load(std::memory_order_relaxed); + sig_value.sival_int = sequence_num.load(std::memory_order_acquire); if (0 != ::sigqueue(tid, sig, sig_value)) { /// The thread may has been already finished. @@ -191,15 +212,15 @@ void StorageSystemStackTrace::fillData(MutableColumns & res_columns, const Conte /// Just in case we will wait for pipe with timeout. In case signal didn't get processed. - if (wait(100)) + if (wait(100) && sig_value.sival_int == data_ready_num.load(std::memory_order_acquire)) { - size_t stack_trace_size = stack_trace->getSize(); - size_t stack_trace_offset = stack_trace->getOffset(); + size_t stack_trace_size = stack_trace.getSize(); + size_t stack_trace_offset = stack_trace.getOffset(); Array arr; arr.reserve(stack_trace_size - stack_trace_offset); for (size_t i = stack_trace_offset; i < stack_trace_size; ++i) - arr.emplace_back(reinterpret_cast(stack_trace->getFramePointers()[i])); + arr.emplace_back(reinterpret_cast(stack_trace.getFramePointers()[i])); res_columns[0]->insert(tid); res_columns[1]->insertData(query_id_data, query_id_size); @@ -214,7 +235,11 @@ void StorageSystemStackTrace::fillData(MutableColumns & res_columns, const Conte res_columns[2]->insertDefault(); } - ++sequence_num; /// FYI: For signed Integral types, arithmetic is defined to use two’s complement representation. There are no undefined results. + /// Signed integer overflow is undefined behavior in both C and C++. However, according to + /// C++ standard, Atomic signed integer arithmetic is defined to use two's complement; there + /// are no undefined results. See https://en.cppreference.com/w/cpp/atomic/atomic and + /// http://eel.is/c++draft/atomics.types.generic#atomics.types.int-8 + ++sequence_num; } } diff --git a/tests/clickhouse-client.xml b/tests/clickhouse-client.xml new file mode 100644 index 00000000000..b00c16f2c99 --- /dev/null +++ b/tests/clickhouse-client.xml @@ -0,0 +1,3 @@ + + 100000 + \ No newline at end of file diff --git a/tests/clickhouse-test-server b/tests/clickhouse-test-server new file mode 100755 index 00000000000..a6b6bd86dfa --- /dev/null +++ b/tests/clickhouse-test-server @@ -0,0 +1,166 @@ +#!/usr/bin/env bash + +set -x +set -o errexit +set -o pipefail + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && cd ../.. 
&& pwd) +DATA_DIR=${DATA_DIR:=`mktemp -d /tmp/clickhouse.test..XXXXX`} +DATA_DIR_PATTERN=${DATA_DIR_PATTERN:=/tmp/clickhouse} # path from config file, will be replaced to temporary +LOG_DIR=${LOG_DIR:=$DATA_DIR/log} +export CLICKHOUSE_BINARY_NAME=${CLICKHOUSE_BINARY_NAME:="clickhouse"} +( [ -x "$ROOT_DIR/programs/${CLICKHOUSE_BINARY_NAME}-server" ] || [ -x "$ROOT_DIR/programs/${CLICKHOUSE_BINARY_NAME}" ] ) && BUILD_DIR=${BUILD_DIR:=$ROOT_DIR} # Build without separate build dir +[ -d "$ROOT_DIR/build${BUILD_TYPE}" ] && BUILD_DIR=${BUILD_DIR:=$ROOT_DIR/build${BUILD_TYPE}} +BUILD_DIR=${BUILD_DIR:=$ROOT_DIR} +[ -x ${CLICKHOUSE_BINARY_NAME}-server" ] && [ -x ${CLICKHOUSE_BINARY_NAME}-client" ] && BIN_DIR= # Allow run in /usr/bin +( [ -x "$BUILD_DIR/programs/${CLICKHOUSE_BINARY_NAME}" ] || [ -x "$BUILD_DIR/programs/${CLICKHOUSE_BINARY_NAME}-server" ] ) && BIN_DIR=${BIN_DIR:=$BUILD_DIR/programs/} +[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-server" ] && CLICKHOUSE_SERVER=${CLICKHOUSE_SERVER:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-server} +[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}" ] && CLICKHOUSE_SERVER=${CLICKHOUSE_SERVER:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME} server} +[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-client" ] && CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-client} +[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}" ] && CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME} client} +[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-extract-from-config" ] && CLICKHOUSE_EXTRACT=${CLICKHOUSE_EXTRACT:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-extract-from-config} +[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}" ] && CLICKHOUSE_EXTRACT=${CLICKHOUSE_EXTRACT:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME} extract-from-config} + +[ -f "$CUR_DIR/server-test.xml" ] && CONFIG_DIR=${CONFIG_DIR=$CUR_DIR}/ +CONFIG_CLIENT_DIR=${CONFIG_CLIENT_DIR=$CONFIG_DIR} +CONFIG_SERVER_DIR=${CONFIG_SERVER_DIR=$CONFIG_DIR} +[ ! -f "${CONFIG_CLIENT_DIR}client-test.xml" ] && CONFIG_CLIENT_DIR=${CONFIG_CLIENT_DIR:=/etc/clickhouse-client/} +[ ! -f "${CONFIG_SERVER_DIR}server-test.xml" ] && CONFIG_SERVER_DIR=${CONFIG_SERVER_DIR:=/etc/clickhouse-server/} +export CLICKHOUSE_CONFIG_CLIENT=${CLICKHOUSE_CONFIG_CLIENT:=${CONFIG_CLIENT_DIR}client-test.xml} +export CLICKHOUSE_CONFIG=${CLICKHOUSE_CONFIG:=${CONFIG_SERVER_DIR}server-test.xml} +CLICKHOUSE_CONFIG_USERS=${CONFIG_SERVER_DIR}users.xml +[ ! -f "$CLICKHOUSE_CONFIG_USERS" ] && CLICKHOUSE_CONFIG_USERS=$CUR_DIR/../programs/server/users.xml +CLICKHOUSE_CONFIG_USERS_D=${CONFIG_SERVER_DIR}users.d +[ ! -d "$CLICKHOUSE_CONFIG_USERS_D" ] && CLICKHOUSE_CONFIG_USERS_D=$CUR_DIR/../programs/server/users.d +[ -x "$CUR_DIR/clickhouse-test" ] && TEST_DIR=${TEST_DIR=$CUR_DIR/} +[ -d "$CUR_DIR/queries" ] && QUERIES_DIR=${QUERIES_DIR=$CUR_DIR/queries} +[ ! -d "$QUERIES_DIR" ] && [ -d "/usr/local/share/clickhouse-test/queries" ] && QUERIES_DIR=${QUERIES_DIR=/usr/local/share/clickhouse-test/queries} +[ ! 
-d "$QUERIES_DIR" ] && [ -d "/usr/share/clickhouse-test/queries" ] && QUERIES_DIR=${QUERIES_DIR=/usr/share/clickhouse-test/queries} + +TEST_PORT_RANDOM=${TEST_PORT_RANDOM=1} +if [ "${TEST_PORT_RANDOM}" ]; then + CLICKHOUSE_PORT_BASE=${CLICKHOUSE_PORT_BASE:=$(( ( RANDOM % 50000 ) + 10000 ))} + CLICKHOUSE_PORT_TCP=${CLICKHOUSE_PORT_TCP:=$(($CLICKHOUSE_PORT_BASE + 1))} + CLICKHOUSE_PORT_HTTP=${CLICKHOUSE_PORT_HTTP:=$(($CLICKHOUSE_PORT_BASE + 2))} + CLICKHOUSE_PORT_INTERSERVER=${CLICKHOUSE_PORT_INTERSERVER:=$(($CLICKHOUSE_PORT_BASE + 3))} + CLICKHOUSE_PORT_TCP_SECURE=${CLICKHOUSE_PORT_TCP_SECURE:=$(($CLICKHOUSE_PORT_BASE + 4))} + CLICKHOUSE_PORT_HTTPS=${CLICKHOUSE_PORT_HTTPS:=$(($CLICKHOUSE_PORT_BASE + 5))} + CLICKHOUSE_PORT_ODBC_BRIDGE=${CLICKHOUSE_ODBC_BRIDGE:=$(($CLICKHOUSE_PORT_BASE + 6))} +fi + +rm -rf $DATA_DIR || true +mkdir -p $LOG_DIR $DATA_DIR/etc || true + +if [ "$DATA_DIR_PATTERN" != "$DATA_DIR" ]; then + cat $CLICKHOUSE_CONFIG | sed -e s!$DATA_DIR_PATTERN!$DATA_DIR! > $DATA_DIR/etc/server-config.xml + export CLICKHOUSE_CONFIG=$DATA_DIR/etc/server-config.xml + cp $CLICKHOUSE_CONFIG_USERS $DATA_DIR/etc + cp -R -L $CLICKHOUSE_CONFIG_USERS_D $DATA_DIR/etc + cat ${CONFIG_SERVER_DIR}/ints_dictionary.xml | sed -e s!9000!$CLICKHOUSE_PORT_TCP! > $DATA_DIR/etc/ints_dictionary.xml + cat ${CONFIG_SERVER_DIR}/strings_dictionary.xml | sed -e s!9000!$CLICKHOUSE_PORT_TCP! > $DATA_DIR/etc/strings_dictionary.xml + cat ${CONFIG_SERVER_DIR}/decimals_dictionary.xml | sed -e s!9000!$CLICKHOUSE_PORT_TCP! > $DATA_DIR/etc/decimals_dictionary.xml +fi + +CLICKHOUSE_EXTRACT_CONFIG=${CLICKHOUSE_EXTRACT_CONFIG:="${CLICKHOUSE_EXTRACT} --config=$CLICKHOUSE_CONFIG"} +CLICKHOUSE_LOG=${CLICKHOUSE_LOG:=${LOG_DIR}clickhouse-server.log} +export CLICKHOUSE_PORT_TCP=${CLICKHOUSE_PORT_TCP:=`$CLICKHOUSE_EXTRACT_CONFIG --key=tcp_port || echo 9000`} +export CLICKHOUSE_PORT_HTTP=${CLICKHOUSE_PORT_HTTP:=`$CLICKHOUSE_EXTRACT_CONFIG --key=http_port || echo 8123`} +export CLICKHOUSE_PORT_INTERSERVER=${CLICKHOUSE_PORT_INTERSERVER:=`$CLICKHOUSE_EXTRACT_CONFIG --key=interserver_http_port || echo 9009`} +export CLICKHOUSE_PORT_TCP_SECURE=${CLICKHOUSE_PORT_TCP_SECURE:=`$CLICKHOUSE_EXTRACT_CONFIG --key=tcp_port_secure`} +export CLICKHOUSE_PORT_HTTPS=${CLICKHOUSE_PORT_HTTPS:=`$CLICKHOUSE_EXTRACT_CONFIG --key=https_port`} +export CLICKHOUSE_ODBC_BRIDGE=${CLICKHOUSE_ODBC_BRIDGE:=`$CLICKHOUSE_EXTRACT_CONFIG --key=odbc_bridge.port || echo 9018`} + +DHPARAM=`$CLICKHOUSE_EXTRACT_CONFIG --key=openSSL.server.dhParamsFile` +PRIVATEKEY=`$CLICKHOUSE_EXTRACT_CONFIG --key=openSSL.server.privateKeyFile` +CERT=`$CLICKHOUSE_EXTRACT_CONFIG --key=openSSL.server.certificateFile` +# Do not generate in case broken extract-config +[ -n "$DHPARAM" ] && openssl dhparam -out $DHPARAM 256 +[ -n "$PRIVATEKEY" ] && [ -n "$CERT" ] && openssl req -subj "/CN=localhost" -new -newkey rsa:2048 -days 365 -nodes -x509 -keyout $PRIVATEKEY -out $CERT + +if [ "$TEST_GDB" ] || [ "$GDB" ]; then + echo -e "run \nset pagination off \nset logging file $LOG_DIR/server.gdb.log \nset logging on \nbacktrace \nthread apply all backtrace \nbacktrace \ndetach \nquit " > $DATA_DIR/gdb.cmd + GDB=${GDB:="gdb -x $DATA_DIR/gdb.cmd --args "} +fi + +# Start a local clickhouse server which will be used to run tests + +# TODO: fix change shard ports: +# --remote_servers.test_shard_localhost_secure.shard.replica.port=$CLICKHOUSE_PORT_TCP_SECURE \ +# --remote_servers.test_shard_localhost.shard.replica.port=$CLICKHOUSE_PORT_TCP \ + +VERSION=`$CLICKHOUSE_CLIENT --version-clean` +# If run from 
compile dir - use in-place compile binary and headers +[ -n "$BIN_DIR" ] && INTERNAL_COMPILER_PARAMS="--compiler_executable_root=${INTERNAL_COMPILER_BIN_ROOT:=$BUILD_DIR/programs/} --compiler_headers=$BUILD_DIR/programs/clang/headers/$VERSION/ --compiler_headers_root=$BUILD_DIR/programs/clang/headers/$VERSION/" + +$GDB $CLICKHOUSE_SERVER --config-file=$CLICKHOUSE_CONFIG --log=$CLICKHOUSE_LOG $TEST_SERVER_PARAMS -- \ + --http_port=$CLICKHOUSE_PORT_HTTP \ + --tcp_port=$CLICKHOUSE_PORT_TCP \ + --https_port=$CLICKHOUSE_PORT_HTTPS \ + --tcp_port_secure=$CLICKHOUSE_PORT_TCP_SECURE \ + --interserver_http_port=$CLICKHOUSE_PORT_INTERSERVER \ + --odbc_bridge.port=$CLICKHOUSE_ODBC_BRIDGE \ + $INTERNAL_COMPILER_PARAMS \ + $TEST_SERVER_CONFIG_PARAMS \ + 2>&1 > $LOG_DIR/server.stdout.log & +CH_PID=$! +sleep ${TEST_SERVER_STARTUP_WAIT:=5} + +if [ "$GDB" ]; then + # Long symbols read + sleep ${TEST_GDB_SLEEP:=60} +fi + +tail -n50 $LOG_DIR/*.log || true + +# Define needed stuff to kill test clickhouse server after tests completion +function finish { + kill $CH_PID || true + wait + tail -n 50 $LOG_DIR/*.log || true + if [ "$GDB" ]; then + cat $LOG_DIR/server.gdb.log || true + fi + rm -rf $DATA_DIR +} +trap finish EXIT SIGINT SIGQUIT SIGTERM + +# Do tests +if [ -n "$*" ]; then + $* +else + TEST_RUN=${TEST_RUN=1} + TEST_DICT=${TEST_DICT=1} + CLICKHOUSE_CLIENT_QUERY="${CLICKHOUSE_CLIENT} --config ${CLICKHOUSE_CONFIG_CLIENT} --port $CLICKHOUSE_PORT_TCP -m -n -q" + $CLICKHOUSE_CLIENT_QUERY 'SELECT * from system.build_options; SELECT * FROM system.clusters;' + CLICKHOUSE_TEST="env ${TEST_DIR}clickhouse-test --force-color --binary ${BIN_DIR}${CLICKHOUSE_BINARY_NAME} --configclient $CLICKHOUSE_CONFIG_CLIENT --configserver $CLICKHOUSE_CONFIG --tmp $DATA_DIR/tmp --queries $QUERIES_DIR $TEST_OPT0 $TEST_OPT" + if [ "${TEST_RUN_STRESS}" ]; then + # Running test in parallel will fail some results (tests can create/fill/drop same tables) + TEST_NPROC=${TEST_NPROC:=$(( `nproc || sysctl -n hw.ncpu || echo 2` * 2))} + for i in `seq 1 ${TEST_NPROC}`; do + $CLICKHOUSE_TEST --order=random --testname --tmp=$DATA_DIR/tmp/tmp${i} & + done + fi + + if [ "${TEST_RUN_PARALLEL}" ]; then + # Running test in parallel will fail some results (tests can create/fill/drop same tables) + TEST_NPROC=${TEST_NPROC:=$(( `nproc || sysctl -n hw.ncpu || echo 2` * 2))} + for i in `seq 1 ${TEST_NPROC}`; do + $CLICKHOUSE_TEST --testname --tmp=$DATA_DIR/tmp/tmp${i} --database=test${i} --parallel=${i}/${TEST_NPROC} & + done + for job in `jobs -p`; do + #echo wait $job + wait $job || let "FAIL+=1" + done + + #echo $FAIL + if [ "$FAIL" != "0" ]; then + return $FAIL + fi + else + ( [ "$TEST_RUN" ] && $CLICKHOUSE_TEST ) || ${TEST_TRUE:=false} + fi + + $CLICKHOUSE_CLIENT_QUERY "SELECT event, value FROM system.events; SELECT metric, value FROM system.metrics; SELECT metric, value FROM system.asynchronous_metrics;" + $CLICKHOUSE_CLIENT_QUERY "SELECT 'Still alive'" +fi \ No newline at end of file diff --git a/tests/client-test.xml b/tests/client-test.xml new file mode 100644 index 00000000000..ee84d41911f --- /dev/null +++ b/tests/client-test.xml @@ -0,0 +1,17 @@ + + + 59000 + 59440 + + + true + true + sslv2,sslv3 + true + none + + AcceptCertificateHandler + + + + \ No newline at end of file diff --git a/tests/performance/async_remote_read.xml b/tests/performance/async_remote_read.xml new file mode 100644 index 00000000000..7f0ee6473ab --- /dev/null +++ b/tests/performance/async_remote_read.xml @@ -0,0 +1,11 @@ + + + SELECT sum(x) + FROM + ( + SELECT 
sipHash64(sipHash64(sipHash64(number))) AS x + FROM remote('127.0.0.{{2,3,4,5}}', numbers(10000000)) + ) + SETTINGS max_threads = 2, max_distributed_connections = 2 + + diff --git a/tests/queries/0_stateless/01016_simhash_minhash.reference b/tests/queries/0_stateless/01016_simhash_minhash.reference new file mode 100644 index 00000000000..edd5afc1af7 --- /dev/null +++ b/tests/queries/0_stateless/01016_simhash_minhash.reference @@ -0,0 +1,144 @@ +0 +2718169299 +2718169299 +1315333491 +1099965843 +5746351769509927967 +5746351769509927967 +8347269581771603092 +6041373934059725027 +(17178276249054052155,8864230932371215121) +(14133097226001036899,7985237721476952807) +(14133097226001036899,7985237721476952807) +(4661257206578284012,15229878657590021759) +(3087743741749030713,11631667950302077749) +(11923981719512934676,1193672187225825732) +(11923981719512934676,1193672187225825732) +(17970606678134635272,3825545538448404526) +(9422952829151664974,568010773615758889) +2548869326 +2548869326 +401385678 +401385710 +2652202579 +2652235347 +2984455347 +2984488115 +12804820948382413807 +12804820948919350245 +11651601468065149391 +11651600368014488527 +18377198011227067677 +18233505035951822655 +5501050600367972694 +5501050600367972692 +(8590465925632898311,12699049311112305995) +(8590465925632898311,15828587343885202011) +(8590465925632898311,15824051019631343049) +(8590465925632898311,12699049311222825283) +(217966158370437743,14452995911556652133) +(217966158370437743,14452995911556652133) +(2170210914777151141,5341809779339553313) +(12469866236432988845,5341809779339553313) +(12271157076799061825,5514511977572226426) +(11639913962681153226,2767634094725305612) +(12271157075024394466,17994666970078080114) +(12271157077109587702,13572452308677868240) +(6252006845407214340,13538761942960976531) +(13795977174459370328,6392395597500134035) +(16118993428517222971,13602445809406467) +(16118993428517222971,13602445809406467) +uniqExact 6 +ngramSimhash +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 938403918 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. 
Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 904817231 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 904849486 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 938469966 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 938404430 +ngramSimhashCaseInsensitive +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 938453071 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 938453599 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. 
The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 938404430 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 636382047 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 938388046 +ngramSimhashUTF8 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. 
Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2400625214 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2669060670 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 2671174174 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 2669060798 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. 
Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 2635506238 +ngramSimhashCaseInsensitiveUTF8 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2984307934 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. 
It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 2967514366 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2715855070 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2967529694 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 2984290526 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2984306910 +wordShingleSimhash +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2384813566025024242 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 2393820766427040734 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. 
SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 2421405261516400471 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2384883934767174398 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2384813567165864670 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2384813567098766070 +wordShingleSimhashCaseInsensitive +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. 
Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 11635224793909957342 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 11617192803208139478 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 11617192803208151794 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 11617192803208151766 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. 
Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 3006891407629799254 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 11617263171950236406 +wordShingleSimhashUTF8 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 9097818277104946605 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. 
It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 9084246141658271116 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 9084247241171471628 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 9088752215857929613 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 9093255814816009484 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 9084247481822285196 +wordShingleSimhashCaseInsensitiveUTF8 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 14788772559981154978 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 14497164445320454820 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. 
The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 14500537785782895266 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 14787646625647636642 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 14500016612976573090 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 14787956717160870888 +ngramMinhash +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. 
All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (15568933215262012353,16287411738807860353) +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (9473589826959436958,14264235017873782379) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 
2 (261441656340606110,13387826928927239258) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (261441656340606110,3305790294064680121) +ngramMinhashCaseInsensitive +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (15568933215262012353,16287411738807860353) +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. 
Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (9473589826959436958,14264235017873782379) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (3051755284325985438,3305790294064680121) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. 
Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (3051755284325985438,13387826928927239258) +ngramMinhashUTF8 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. 
It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 4 (309830857064065611,7476109060377919216) +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (309830856946430871,7521913981442105351) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (309830857559697399,7476109060377919216) +ngramMinhashCaseInsensitiveUTF8 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). 
In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (13010809262502929096,2266175201446733829) +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. 
The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 5 (16827851229372179144,976408052548769549) +wordShingleMinhash +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. 
Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (14343822344862533053,11776483993821900250) +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (18417749332128868312,11776483993821900250) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (18417749329907528200,14156831980621923226) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (4600092690178227586,11776483993821900250) +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. 
Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (12998011837685887081,1565093152297016105) +wordShingleMinhashCaseInsensitive +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (12998011837880940480,1565093152297016105) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (1100751419997894255,15225006848401474458) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. 
Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (1100751419777226283,12993805708561478711) +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (1260401089202135898,12993805709529540523) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (1638964264353944555,12993805708561478711) +wordShingleMinhashUTF8 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. 
Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (742280067319112377,14237963017046410351) +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (7237654052534217600,14400297883226437452) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (742280067319112377,17574811665615962276) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. 
It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 3 (3458625375707825328,17574811665615962276) +wordShingleMinhashCaseInsensitiveUTF8 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (7032848390598450936,5104668712725998486) +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). 
In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (15582670464629505464,13034678298246801511) +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (9935434838523508980,7648038926638343017) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. 
The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (7032848390598450936,16870743692447971238) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (7302041809563941951,6856814412450461959) diff --git a/tests/queries/0_stateless/01016_simhash_minhash.sql b/tests/queries/0_stateless/01016_simhash_minhash.sql new file mode 100644 index 00000000000..61b9ac14259 --- /dev/null +++ b/tests/queries/0_stateless/01016_simhash_minhash.sql @@ -0,0 +1,111 @@ +SELECT ngramSimhash(''); +SELECT ngramSimhash('what a cute cat.'); +SELECT ngramSimhashCaseInsensitive('what a cute cat.'); +SELECT ngramSimhashUTF8('what a cute cat.'); +SELECT ngramSimhashCaseInsensitiveUTF8('what a cute cat.'); +SELECT wordShingleSimhash('what a cute cat.'); +SELECT wordShingleSimhashCaseInsensitive('what a cute cat.'); +SELECT wordShingleSimhashUTF8('what a cute cat.'); +SELECT wordShingleSimhashCaseInsensitiveUTF8('what a cute cat.'); + +SELECT ngramMinhash(''); +SELECT ngramMinhash('what a cute cat.'); +SELECT ngramMinhashCaseInsensitive('what a cute cat.'); +SELECT ngramMinhashUTF8('what a cute cat.'); +SELECT ngramMinhashCaseInsensitiveUTF8('what a cute cat.'); +SELECT wordShingleMinhash('what a cute cat.'); +SELECT wordShingleMinhashCaseInsensitive('what a cute cat.'); +SELECT wordShingleMinhashUTF8('what a cute cat.'); +SELECT wordShingleMinhashCaseInsensitiveUTF8('what a cute cat.'); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + s String +)ENGINE = Memory(); + +INSERT INTO defaults values ('It is the latest occurrence of the Southeast European haze, the issue that occurs in constant intensity during every wet season. It has mainly been caused by forest fires resulting from illegal slash-and-burn clearing performed on behalf of the palm oil industry in Kazakhstan, principally on the islands, which then spread quickly in the dry season.') ('It is the latest occurrence of the Southeast Asian haze, the issue that occurs in constant intensity during every wet season. 
It has mainly been caused by forest fires resulting from illegal slash-and-burn clearing performed on behalf of the palm oil industry in Kazakhstan, principally on the islands, which then spread quickly in the dry season.'); + +SELECT ngramSimhash(s) FROM defaults; +SELECT ngramSimhashCaseInsensitive(s) FROM defaults; +SELECT ngramSimhashUTF8(s) FROM defaults; +SELECT ngramSimhashCaseInsensitiveUTF8(s) FROM defaults; +SELECT wordShingleSimhash(s) FROM defaults; +SELECT wordShingleSimhashCaseInsensitive(s) FROM defaults; +SELECT wordShingleSimhashUTF8(s) FROM defaults; +SELECT wordShingleSimhashCaseInsensitiveUTF8(s) FROM defaults; + +SELECT ngramMinhash(s) FROM defaults; +SELECT ngramMinhashCaseInsensitive(s) FROM defaults; +SELECT ngramMinhashUTF8(s) FROM defaults; +SELECT ngramMinhashCaseInsensitiveUTF8(s) FROM defaults; +SELECT wordShingleMinhash(s) FROM defaults; +SELECT wordShingleMinhashCaseInsensitive(s) FROM defaults; +SELECT wordShingleMinhashUTF8(s) FROM defaults; +SELECT wordShingleMinhashCaseInsensitiveUTF8(s) FROM defaults; + +TRUNCATE TABLE defaults; +INSERT INTO defaults SELECT arrayJoin(splitByString('\n\n', +'ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency. +ClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes. +ClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. + +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency. +ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system''s read and write availability. +ClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. + +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency. +ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. 
Downtime for one site or the entire data center will not affect the system''s read / write availability. +ClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. + +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency. +ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system. +ClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. + +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency. +ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system. +ClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. + +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency. +ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system. +ClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.' 
+)); + +SELECT 'uniqExact', uniqExact(s) FROM defaults; + + +SELECT 'ngramSimhash'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramSimhash(s) as h FROM defaults GROUP BY h; +SELECT 'ngramSimhashCaseInsensitive'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramSimhashCaseInsensitive(s) as h FROM defaults GROUP BY h; +SELECT 'ngramSimhashUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramSimhashUTF8(s) as h FROM defaults GROUP BY h; +SELECT 'ngramSimhashCaseInsensitiveUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramSimhashCaseInsensitiveUTF8(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleSimhash'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleSimhash(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleSimhashCaseInsensitive'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleSimhashCaseInsensitive(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleSimhashUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleSimhashUTF8(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleSimhashCaseInsensitiveUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleSimhashCaseInsensitiveUTF8(s) as h FROM defaults GROUP BY h; + +SELECT 'ngramMinhash'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramMinhash(s) as h FROM defaults GROUP BY h; +SELECT 'ngramMinhashCaseInsensitive'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramMinhashCaseInsensitive(s) as h FROM defaults GROUP BY h; +SELECT 'ngramMinhashUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramMinhashUTF8(s) as h FROM defaults GROUP BY h; +SELECT 'ngramMinhashCaseInsensitiveUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramMinhashCaseInsensitiveUTF8(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleMinhash'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleMinhash(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleMinhashCaseInsensitive'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleMinhashCaseInsensitive(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleMinhashUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleMinhashUTF8(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleMinhashCaseInsensitiveUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleMinhashCaseInsensitiveUTF8(s) as h FROM defaults GROUP BY h; + +DROP TABLE defaults; diff --git a/tests/queries/0_stateless/01017_bithamming_distance.reference b/tests/queries/0_stateless/01017_bithamming_distance.reference new file mode 100644 index 00000000000..cc2d4f39154 --- /dev/null +++ b/tests/queries/0_stateless/01017_bithamming_distance.reference @@ -0,0 +1,15 @@ +1 +7 +63 +2 +1 +3 +5 +4 +6 +6 +6 +3 +5 +9 +9 diff --git a/tests/queries/0_stateless/01017_bithamming_distance.sql b/tests/queries/0_stateless/01017_bithamming_distance.sql new file mode 100644 index 00000000000..4b36894b97c --- /dev/null +++ b/tests/queries/0_stateless/01017_bithamming_distance.sql @@ -0,0 +1,20 @@ +SELECT bitHammingDistance(1, 5); +SELECT bitHammingDistance(100, 100000); +SELECT bitHammingDistance(-1, 1); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + n1 UInt8, + n2 UInt16, + n3 UInt32, + n4 UInt64 +)ENGINE = Memory(); + +INSERT INTO defaults 
VALUES (1, 2, 3, 4) (12, 4345, 435, 1233) (45, 675, 32343, 54566) (90, 784, 9034, 778752); + +SELECT bitHammingDistance(4, n1) FROM defaults; +SELECT bitHammingDistance(n2, 100) FROM defaults; +SELECT bitHammingDistance(n3, n4) FROM defaults; + +DROP TABLE defaults; diff --git a/tests/queries/0_stateless/01017_tuplehamming_distance.reference b/tests/queries/0_stateless/01017_tuplehamming_distance.reference new file mode 100644 index 00000000000..017ffb0cd33 --- /dev/null +++ b/tests/queries/0_stateless/01017_tuplehamming_distance.reference @@ -0,0 +1,15 @@ +2 +1 +1 +0 +2 +2 +2 +2 +1 +2 +2 +2 +0 +2 +2 diff --git a/tests/queries/0_stateless/01017_tuplehamming_distance.sql b/tests/queries/0_stateless/01017_tuplehamming_distance.sql new file mode 100644 index 00000000000..d0ed1cee096 --- /dev/null +++ b/tests/queries/0_stateless/01017_tuplehamming_distance.sql @@ -0,0 +1,19 @@ +SELECT tupleHammingDistance((1, 2), (3, 4)); +SELECT tupleHammingDistance((120, 243), (120, 434)); +SELECT tupleHammingDistance((-12, 434), (434, 434)); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + t1 Tuple(UInt16, UInt16), + t2 Tuple(UInt32, UInt32), + t3 Tuple(Int64, Int64) +)ENGINE = Memory(); + +INSERT INTO defaults VALUES ((12, 43), (12312, 43453) ,(-10, 32)) ((1, 4), (546, 12345), (546, 12345)) ((90, 9875), (43456, 234203), (1231, -123)) ((87, 987), (545645, 768354634), (9123, 909)); + +SELECT tupleHammingDistance((12, 43), t1) FROM defaults; +SELECT tupleHammingDistance(t2, (546, 456)) FROM defaults; +SELECT tupleHammingDistance(t2, t3) FROM defaults; + +DROP TABLE defaults; diff --git a/tests/queries/0_stateless/01051_system_stack_trace.reference b/tests/queries/0_stateless/01051_system_stack_trace.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/01051_system_stack_trace.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/01051_system_stack_trace.sql b/tests/queries/0_stateless/01051_system_stack_trace.sql new file mode 100644 index 00000000000..32d344fce7e --- /dev/null +++ b/tests/queries/0_stateless/01051_system_stack_trace.sql @@ -0,0 +1,2 @@ +-- at least this query should be present +SELECT count() > 0 FROM system.stack_trace WHERE query_id != ''; diff --git a/tests/queries/0_stateless/01244_optimize_distributed_group_by_sharding_key.sql b/tests/queries/0_stateless/01244_optimize_distributed_group_by_sharding_key.sql index 9bc50ae2fc7..d152326091b 100644 --- a/tests/queries/0_stateless/01244_optimize_distributed_group_by_sharding_key.sql +++ b/tests/queries/0_stateless/01244_optimize_distributed_group_by_sharding_key.sql @@ -68,7 +68,7 @@ select 'OFFSET'; select count(), * from dist_01247 group by number offset 1; -- this will emulate different data on for different shards select 'WHERE LIMIT OFFSET'; -select count(), * from dist_01247 where number = _shard_num-1 group by number limit 1 offset 1; +select count(), * from dist_01247 where number = _shard_num-1 group by number order by number limit 1 offset 1; select 'LIMIT BY 1'; select count(), * from dist_01247 group by number order by number limit 1 by number; diff --git a/tests/queries/0_stateless/01415_sticking_mutations.sh b/tests/queries/0_stateless/01415_sticking_mutations.sh index 9ae1ef03d03..ce34cd09ca3 100755 --- a/tests/queries/0_stateless/01415_sticking_mutations.sh +++ b/tests/queries/0_stateless/01415_sticking_mutations.sh @@ -43,7 +43,7 @@ function check_sticky_mutations() $CLICKHOUSE_CLIENT --query "SYSTEM START MERGES sticking_mutations" - # just to be 
sure, that previous mutations finished + # Just to be sure, that previous mutations finished $CLICKHOUSE_CLIENT --query "ALTER TABLE sticking_mutations DELETE WHERE value2 % 31 == 0 SETTINGS mutations_sync = 1" $CLICKHOUSE_CLIENT --query "OPTIMIZE TABLE sticking_mutations FINAL" diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference new file mode 100644 index 00000000000..6c78108d734 --- /dev/null +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -0,0 +1,149 @@ +set allow_experimental_window_functions = 1; + +-- just something basic + +select number, count() over (partition by intDiv(number, 3) order by number) from numbers(10); + +-- proper calculation across blocks + +0 1 +1 2 +2 3 +3 1 +4 2 +5 3 +6 1 +7 2 +8 3 +9 1 +select number, max(number) over (partition by intDiv(number, 3) order by number desc) from numbers(10) settings max_block_size = 2; + +-- not a window function + +2 2 +1 2 +0 2 +5 5 +4 5 +3 5 +8 8 +7 8 +6 8 +9 9 +select number, abs(number) over (partition by toString(intDiv(number, 3))) from numbers(10); -- { serverError 63 } + +-- no partition by + +select number, avg(number) over (order by number) from numbers(10); + +-- no order by + +0 0 +1 0.5 +2 1 +3 1.5 +4 2 +5 2.5 +6 3 +7 3.5 +8 4 +9 4.5 +select number, quantileExact(number) over (partition by intDiv(number, 3)) from numbers(10); + +-- can add an alias after window spec + +0 0 +1 1 +2 1 +3 3 +4 4 +5 4 +6 6 +7 7 +8 7 +9 9 +select number, quantileExact(number) over (partition by intDiv(number, 3)) q from numbers(10); + +-- can't reference it yet -- the window functions are calculated at the +-- last stage of select, after all other functions. + +0 0 +1 1 +2 1 +3 3 +4 4 +5 4 +6 6 +7 7 +8 7 +9 9 +select q * 10, quantileExact(number) over (partition by intDiv(number, 3)) q from numbers(10); -- { serverError 47 } + +-- should work in ORDER BY though + +select number, max(number) over (partition by intDiv(number, 3) order by number desc) m from numbers(10) order by m desc, number; + +-- this one doesn't work yet -- looks like the column names clash, and the +-- window count() is overwritten with aggregate count() +-- select number, count(), count() over (partition by intDiv(number, 3)) from numbers(10) group by number order by count() desc; + +-- different windows +-- an explain test would also be helpful, but it's too immature now and I don't +-- want to change reference all the time + +9 9 +6 8 +7 8 +8 8 +3 5 +4 5 +5 5 +0 2 +1 2 +2 2 +select number, max(number) over (partition by intDiv(number, 3) order by number desc), count(number) over (partition by intDiv(number, 5) order by number) as m from numbers(31) order by number settings max_block_size = 2; + +-- two functions over the same window +-- an explain test would also be helpful, but it's too immature now and I don't +-- want to change reference all the time + +0 2 1 +1 2 2 +2 2 3 +3 5 4 +4 5 5 +5 5 1 +6 8 2 +7 8 3 +8 8 4 +9 11 5 +10 11 1 +11 11 2 +12 14 3 +13 14 4 +14 14 5 +15 17 1 +16 17 2 +17 17 3 +18 20 4 +19 20 5 +20 20 1 +21 23 2 +22 23 3 +23 23 4 +24 26 5 +25 26 1 +26 26 2 +27 29 3 +28 29 4 +29 29 5 +30 30 1 +select number, max(number) over (partition by intDiv(number, 3) order by number desc), count(number) over (partition by intDiv(number, 3) order by number desc) as m from numbers(7) order by number settings max_block_size = 2; + +0 2 3 +1 2 2 +2 2 1 +3 5 3 +4 5 2 +5 5 1 +6 6 1 diff --git a/tests/queries/0_stateless/01591_window_functions.sql 
b/tests/queries/0_stateless/01591_window_functions.sql new file mode 100644 index 00000000000..a28d435d3f8 --- /dev/null +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -0,0 +1,42 @@ +-- { echo } + +set allow_experimental_window_functions = 1; + +-- just something basic +select number, count() over (partition by intDiv(number, 3) order by number) from numbers(10); + +-- proper calculation across blocks +select number, max(number) over (partition by intDiv(number, 3) order by number desc) from numbers(10) settings max_block_size = 2; + +-- not a window function +select number, abs(number) over (partition by toString(intDiv(number, 3))) from numbers(10); -- { serverError 63 } + +-- no partition by +select number, avg(number) over (order by number) from numbers(10); + +-- no order by +select number, quantileExact(number) over (partition by intDiv(number, 3)) from numbers(10); + +-- can add an alias after window spec +select number, quantileExact(number) over (partition by intDiv(number, 3)) q from numbers(10); + +-- can't reference it yet -- the window functions are calculated at the +-- last stage of select, after all other functions. +select q * 10, quantileExact(number) over (partition by intDiv(number, 3)) q from numbers(10); -- { serverError 47 } + +-- should work in ORDER BY though +select number, max(number) over (partition by intDiv(number, 3) order by number desc) m from numbers(10) order by m desc, number; + +-- this one doesn't work yet -- looks like the column names clash, and the +-- window count() is overwritten with aggregate count() +-- select number, count(), count() over (partition by intDiv(number, 3)) from numbers(10) group by number order by count() desc; + +-- different windows +-- an explain test would also be helpful, but it's too immature now and I don't +-- want to change reference all the time +select number, max(number) over (partition by intDiv(number, 3) order by number desc), count(number) over (partition by intDiv(number, 5) order by number) as m from numbers(31) order by number settings max_block_size = 2; + +-- two functions over the same window +-- an explain test would also be helpful, but it's too immature now and I don't +-- want to change reference all the time +select number, max(number) over (partition by intDiv(number, 3) order by number desc), count(number) over (partition by intDiv(number, 3) order by number desc) as m from numbers(7) order by number settings max_block_size = 2; diff --git a/tests/server-test.xml b/tests/server-test.xml new file mode 100644 index 00000000000..0b5e8f760a8 --- /dev/null +++ b/tests/server-test.xml @@ -0,0 +1,143 @@ + + + + + trace + /tmp/clickhouse/log/clickhouse-server.log + /tmp/clickhouse/log/clickhouse-server.err.log + 10M + 1 + 0 + + :: + 0.0.0.0 + 1 + 58123 + 59000 + 58443 + 59440 + 59009 + 10000 + + + + /tmp/clickhouse/etc/server.crt + /tmp/clickhouse/etc/server.key + + /tmp/clickhouse/etc/dhparam.pem + none + true + true + sslv2,sslv3 + true + + + + true + true + sslv2,sslv3 + true + none + + AcceptCertificateHandler + + + + + 3 + /tmp/clickhouse/data/ + /tmp/clickhouse/tmp/ + users.xml + /tmp/clickhouse/data/access/ + custom_ + 5368709120 + default + default + Europe/Moscow + + + + + + localhost + 59000 + + + + + localhost + 1 + + + + + + + localhost + 59000 + + + + + + + 127.0.0.1 + 59000 + + + + + 127.0.0.2 + 59000 + + + + + + + localhost + 59000 + + + + + localhost + 59000 + + + + + + + localhost + 59440 + 1 + + + + + + + + 3600 + 3600 + 60 + + system + query_log
+ 7500 +
+ *_dictionary.xml + + + + /clickhouse/task_queue/ddl + + /tmp/clickhouse/data/format_schemas/ + + + TOPSECRET.TOPSECRET + [hidden] + + +
\ No newline at end of file
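
Editor's note: the tests above introduce the simhash/minhash string-hashing functions and the bit- and tuple-level Hamming distances separately, but never show the two halves combined. Below is a minimal sketch of how they are meant to fit together, assuming only the function names and result types visible in the tests (ngramMinhash returning a tuple of two UInt64 values, ngramSimhash returning a single integer) and a table shaped like the test's `defaults`; it is illustrative, not part of the patch.

-- Cluster near-duplicate rows by their minhash signature, as 01016_simhash_minhash.sql does:
SELECT any(s) AS sample_text, count() AS near_duplicates, ngramMinhash(s) AS h
FROM defaults
GROUP BY h
HAVING count() > 1;

-- Similar strings should yield simhash values that differ in only a few bits,
-- so a small bitHammingDistance suggests a likely near-duplicate pair:
SELECT bitHammingDistance(ngramSimhash('what a cute cat.'), ngramSimhash('what a cute dog.')) AS simhash_distance;

-- The same idea for minhash tuples, via tupleHammingDistance (0, 1 or 2 differing components):
SELECT tupleHammingDistance(ngramMinhash('what a cute cat.'), ngramMinhash('what a cute dog.')) AS minhash_distance;

In the 01016 reference data above, groups of paraphrased ClickHouse descriptions collapse onto a shared hash in exactly this way; the count printed next to each hash tuple is the size of that near-duplicate group.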