Merge remote-tracking branch 'upstream/master' into HEAD

Anton Popov 2020-12-23 15:16:43 +03:00
commit 57857dda63
122 changed files with 5219 additions and 672 deletions

View File

@ -61,6 +61,16 @@
# endif
#endif
#if defined(ADDRESS_SANITIZER)
# define BOOST_USE_ASAN 1
# define BOOST_USE_UCONTEXT 1
#endif
#if defined(THREAD_SANITIZER)
# define BOOST_USE_TSAN 1
# define BOOST_USE_UCONTEXT 1
#endif
/// TODO: Strangely enough, there is no way to detect the UB sanitizer.
/// Explicitly allow undefined behaviour for certain functions. Use it as a function attribute.
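A minimal sketch of how such a function attribute is typically defined for Clang/GCC; the macro name NO_SANITIZE_UNDEFINED here is illustrative:

``` cpp
/// Sketch, assuming a Clang/GCC toolchain: disables UBSan for one function.
#if defined(__clang__) || defined(__GNUC__)
#    define NO_SANITIZE_UNDEFINED __attribute__((__no_sanitize__("undefined")))
#else
#    define NO_SANITIZE_UNDEFINED
#endif

/// Signed overflow here is intentional and must not be reported by UBSan.
NO_SANITIZE_UNDEFINED int wrappingAdd(int a, int b)
{
    return a + b;
}
```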

View File

@ -4,6 +4,11 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/resource.h>
#if defined(__linux__)
#include <sys/prctl.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include <string.h>
@ -12,7 +17,6 @@
#include <unistd.h>
#include <typeinfo>
#include <sys/resource.h>
#include <iostream>
#include <fstream>
#include <sstream>
@ -22,7 +26,6 @@
#include <Poco/Observer.h>
#include <Poco/AutoPtr.h>
#include <Poco/PatternFormatter.h>
#include <Poco/TaskManager.h>
#include <Poco/File.h>
#include <Poco/Path.h>
#include <Poco/Message.h>
@ -470,7 +473,6 @@ BaseDaemon::~BaseDaemon()
void BaseDaemon::terminate()
{
getTaskManager().cancelAll();
if (::raise(SIGTERM) != 0)
throw Poco::SystemException("cannot terminate process");
}
@ -478,22 +480,11 @@ void BaseDaemon::terminate()
void BaseDaemon::kill()
{
dumpCoverageReportIfPossible();
pid.reset();
pid_file.reset();
if (::raise(SIGKILL) != 0)
throw Poco::SystemException("cannot kill process");
}
void BaseDaemon::sleep(double seconds)
{
wakeup_event.reset();
wakeup_event.tryWait(seconds * 1000);
}
void BaseDaemon::wakeup()
{
wakeup_event.set();
}
std::string BaseDaemon::getDefaultCorePath() const
{
return "/opt/cores/";
@ -564,7 +555,6 @@ void BaseDaemon::initialize(Application & self)
{
closeFDs();
task_manager = std::make_unique<Poco::TaskManager>();
ServerApplication::initialize(self);
/// now highest priority (lowest value) is PRIO_APPLICATION = -100, we want higher!
@ -648,10 +638,6 @@ void BaseDaemon::initialize(Application & self)
throw Poco::OpenFileException("Cannot attach stdout to " + stdout_path);
}
/// Create pid file.
if (config().has("pid"))
pid.emplace(config().getString("pid"), DB::StatusFile::write_pid);
/// Change path for logging.
if (!log_path.empty())
{
@ -667,9 +653,17 @@ void BaseDaemon::initialize(Application & self)
throw Poco::Exception("Cannot change directory to /tmp");
}
// sensitive data masking rules are not used here
/// sensitive data masking rules are not used here
buildLoggers(config(), logger(), self.commandName());
/// After initialized loggers but before initialized signal handling.
if (should_setup_watchdog)
setupWatchdog();
/// Create pid file.
if (config().has("pid"))
pid_file.emplace(config().getString("pid"), DB::StatusFile::write_pid);
if (is_daemon)
{
/** Change working directory to the directory to write core dumps.
@ -704,28 +698,13 @@ void BaseDaemon::initialize(Application & self)
}
void BaseDaemon::initializeTerminationAndSignalProcessing()
{
SentryWriter::initialize(config());
std::set_terminate(terminate_handler);
/// We want to avoid SIGPIPE when working with sockets and pipes, and just handle return value/errno instead.
{
sigset_t sig_set;
if (sigemptyset(&sig_set) || sigaddset(&sig_set, SIGPIPE) || pthread_sigmask(SIG_BLOCK, &sig_set, nullptr))
throw Poco::Exception("Cannot block signal.");
}
/// Setup signal handlers.
auto add_signal_handler =
[this](const std::vector<int> & signals, signal_function handler)
static void addSignalHandler(const std::vector<int> & signals, signal_function handler, std::vector<int> * out_handled_signals)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = handler;
sa.sa_flags = SA_SIGINFO;
{
#if defined(OS_DARWIN)
sigemptyset(&sa.sa_mask);
for (auto signal : signals)
@ -743,15 +722,47 @@ void BaseDaemon::initializeTerminationAndSignalProcessing()
if (sigaction(signal, &sa, nullptr))
throw Poco::Exception("Cannot set signal handler.");
std::copy(signals.begin(), signals.end(), std::back_inserter(handled_signals));
}
if (out_handled_signals)
std::copy(signals.begin(), signals.end(), std::back_inserter(*out_handled_signals));
};
static void blockSignals(const std::vector<int> & signals)
{
sigset_t sig_set;
#if defined(OS_DARWIN)
sigemptyset(&sig_set);
for (auto signal : signals)
sigaddset(&sig_set, signal);
#else
if (sigemptyset(&sig_set))
throw Poco::Exception("Cannot block signal.");
for (auto signal : signals)
if (sigaddset(&sig_set, signal))
throw Poco::Exception("Cannot block signal.");
#endif
if (pthread_sigmask(SIG_BLOCK, &sig_set, nullptr))
throw Poco::Exception("Cannot block signal.");
};
void BaseDaemon::initializeTerminationAndSignalProcessing()
{
SentryWriter::initialize(config());
std::set_terminate(terminate_handler);
/// We want to avoid SIGPIPE when working with sockets and pipes, and just handle return value/errno instead.
blockSignals({SIGPIPE});
/// Setup signal handlers.
/// SIGTSTP is added for debugging purposes: to output a stack trace of any running thread at any time.
add_signal_handler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP}, signalHandler);
add_signal_handler({SIGHUP, SIGUSR1}, closeLogsSignalHandler);
add_signal_handler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler);
addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP}, signalHandler, &handled_signals);
addSignalHandler({SIGHUP, SIGUSR1}, closeLogsSignalHandler, &handled_signals);
addSignalHandler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler, &handled_signals);
#if defined(SANITIZER)
__sanitizer_set_death_callback(sanitizerDeathCallback);
@ -786,23 +797,6 @@ void BaseDaemon::logRevision() const
+ ", PID " + std::to_string(getpid()));
}
/// Makes the server shut down if at least one Poco::Task has failed.
void BaseDaemon::exitOnTaskError()
{
Poco::Observer<BaseDaemon, Poco::TaskFailedNotification> obs(*this, &BaseDaemon::handleNotification);
getTaskManager().addObserver(obs);
}
/// Used for exitOnTaskError()
void BaseDaemon::handleNotification(Poco::TaskFailedNotification *_tfn)
{
task_failed = true;
Poco::AutoPtr<Poco::TaskFailedNotification> fn(_tfn);
Poco::Logger * lg = &(logger());
LOG_ERROR(lg, "Task '{}' failed. Daemon is shutting down. Reason - {}", fn->task()->name(), fn->reason().displayText());
ServerApplication::terminate();
}
void BaseDaemon::defineOptions(Poco::Util::OptionSet & new_options)
{
new_options.addOption(
@ -863,13 +857,144 @@ void BaseDaemon::onInterruptSignals(int signal_id)
if (sigint_signals_counter >= 2)
{
LOG_INFO(&logger(), "Received second signal Interrupt. Immediately terminate.");
kill();
call_default_signal_handler(signal_id);
/// If the above did not help.
_exit(128 + signal_id);
}
}
void BaseDaemon::waitForTerminationRequest()
{
/// NOTE: as we already process signals via pipe, we don't have to block them with sigprocmask in threads
std::unique_lock<std::mutex> lock(signal_handler_mutex);
signal_event.wait(lock, [this](){ return terminate_signals_counter > 0; });
}
void BaseDaemon::shouldSetupWatchdog(char * argv0_)
{
should_setup_watchdog = true;
argv0 = argv0_;
}
void BaseDaemon::setupWatchdog()
{
/// Initialize in advance to avoid double initialization in forked processes.
DateLUT::instance();
std::string original_process_name;
if (argv0)
original_process_name = argv0;
while (true)
{
static pid_t pid = -1;
pid = fork();
if (-1 == pid)
throw Poco::Exception("Cannot fork");
if (0 == pid)
{
logger().information("Forked a child process to watch");
#if defined(__linux__)
if (0 != prctl(PR_SET_PDEATHSIG, SIGKILL))
logger().warning("Cannot do prctl to ask termination with parent.");
#endif
return;
}
/// Change short thread name and process name.
setThreadName("clckhouse-watch"); /// 15 characters
if (argv0)
{
const char * new_process_name = "clickhouse-watchdog";
memset(argv0, 0, original_process_name.size());
memcpy(argv0, new_process_name, std::min(strlen(new_process_name), original_process_name.size()));
}
logger().information(fmt::format("Will watch for the process with pid {}", pid));
/// Forward signals to the child process.
addSignalHandler(
{SIGHUP, SIGUSR1, SIGINT, SIGQUIT, SIGTERM},
[](int sig, siginfo_t *, void *)
{
/// Forward all signals except INT, as it can be sent by the terminal to the process group when the user presses Ctrl+C,
/// and we treat double delivery of this signal as a request for immediate termination.
if (sig == SIGINT)
return;
const char * error_message = "Cannot forward signal to the child process.\n";
if (0 != ::kill(pid, sig))
{
auto res = write(STDERR_FILENO, error_message, strlen(error_message));
(void)res;
}
},
nullptr);
int status = 0;
do
{
if (-1 != waitpid(pid, &status, WUNTRACED | WCONTINUED) || errno == ECHILD)
{
if (WIFSTOPPED(status))
logger().warning(fmt::format("Child process was stopped by signal {}.", WSTOPSIG(status)));
else if (WIFCONTINUED(status))
logger().warning(fmt::format("Child process was continued."));
else
break;
}
else if (errno != EINTR)
throw Poco::Exception("Cannot waitpid, errno: " + std::string(strerror(errno)));
} while (true);
if (errno == ECHILD)
{
logger().information("Child process no longer exists.");
_exit(status);
}
if (WIFEXITED(status))
{
logger().information(fmt::format("Child process exited normally with code {}.", WEXITSTATUS(status)));
_exit(status);
}
if (WIFSIGNALED(status))
{
int sig = WTERMSIG(status);
if (sig == SIGKILL)
{
logger().fatal(fmt::format("Child process was terminated by signal {} (KILL)."
" If it is not done by 'forcestop' command or manually,"
" the possible cause is OOM Killer (see 'dmesg' and look at the '/var/log/kern.log' for the details).", sig));
}
else
{
logger().fatal(fmt::format("Child process was terminated by signal {}.", sig));
if (sig == SIGINT || sig == SIGTERM || sig == SIGQUIT)
_exit(status);
}
}
else
{
logger().fatal("Child process was not exited normally by unknown reason.");
}
/// Automatic restart is not enabled but you can play with it.
#if 1
_exit(status);
#else
logger().information("Will restart.");
if (argv0)
memcpy(argv0, original_process_name.c_str(), original_process_name.size());
#endif
}
}

View File

@ -12,7 +12,6 @@
#include <chrono>
#include <Poco/Process.h>
#include <Poco/ThreadPool.h>
#include <Poco/TaskNotification.h>
#include <Poco/Util/Application.h>
#include <Poco/Util/ServerApplication.h>
#include <Poco/Net/SocketAddress.h>
@ -26,9 +25,6 @@
#include <loggers/Loggers.h>
namespace Poco { class TaskManager; }
/// \brief Base class for applications that can run as daemons.
///
/// \code
@ -52,31 +48,26 @@ public:
BaseDaemon();
~BaseDaemon() override;
/// Loads the configuration and "builds" loggers for writing to files
/// Load configuration, prepare loggers, etc.
void initialize(Poco::Util::Application &) override;
/// Reads the configuration
void reloadConfiguration();
/// Defines a command line parameter
/// Process command line parameters
void defineOptions(Poco::Util::OptionSet & new_options) override;
/// Makes the daemon terminate if at least one task has failed
void exitOnTaskError();
/// Graceful shutdown
static void terminate();
/// Daemon shutdown ("soft")
void terminate();
/// Daemon shutdown ("hard")
/// Forceful shutdown
void kill();
/// Has a termination signal been received?
/// Cancellation request has been received.
bool isCancelled() const
{
return is_cancelled;
}
/// Get a reference to the daemon instance
static BaseDaemon & instance()
{
return dynamic_cast<BaseDaemon &>(Poco::Util::Application::instance());
@ -85,12 +76,6 @@ public:
/// return none if daemon doesn't exist, reference to the daemon otherwise
static std::optional<std::reference_wrapper<BaseDaemon>> tryGetInstance() { return tryGetInstance<BaseDaemon>(); }
/// Sleeps for the given number of seconds, or until a wakeup event
void sleep(double seconds);
/// Wake it up
void wakeup();
/// In Graphite, path components (folders) are separated by dots.
/// We use paths of the form root_path.hostname_yandex_ru.key
/// root_path is one_min by default
@ -131,24 +116,23 @@ public:
/// also doesn't close global internal pipes for signal handling
static void closeFDs();
/// If this method is called after initialization and before run,
will fork a child process and set up a watchdog that prints diagnostic info if the child terminates.
argv0 is needed to change the process name (consequently, it is needed for scripts involving "pgrep" or "pidof" to work correctly).
void shouldSetupWatchdog(char * argv0_);
protected:
/// Returns the application's TaskManager
/// all task_manager methods must be called from a single thread,
/// otherwise a deadlock is possible, because joinAll runs under a lock and every other method takes the lock too
Poco::TaskManager & getTaskManager() { return *task_manager; }
virtual void logRevision() const;
/// Used by exitOnTaskError()
void handleNotification(Poco::TaskFailedNotification *);
/// thread safe
virtual void handleSignal(int signal_id);
/// initialize termination process and signal handlers
virtual void initializeTerminationAndSignalProcessing();
/// handling termination signals via a pipe does not require blocking the signal with sigprocmask in all threads
/// forks the main process and watches whether it gets killed
void setupWatchdog();
void waitForTerminationRequest()
#if defined(POCO_CLICKHOUSE_PATCH) || POCO_VERSION >= 0x02000000 // not virtual in old upstream Poco
override
@ -162,21 +146,13 @@ protected:
virtual std::string getDefaultCorePath() const;
std::unique_ptr<Poco::TaskManager> task_manager;
std::optional<DB::StatusFile> pid;
std::optional<DB::StatusFile> pid_file;
std::atomic_bool is_cancelled{false};
/// The flag is set by a message from a Task (on abnormal termination).
bool task_failed = false;
bool log_to_console = false;
/// An event to wake up during the wait
Poco::Event wakeup_event;
/// A thread that receives the HUP/USR1 signal to close logs.
/// A thread that acts on HUP and USR1 signal (close logs).
Poco::Thread signal_listener_thread;
std::unique_ptr<Poco::Runnable> signal_listener;
@ -194,6 +170,9 @@ protected:
String build_id_info;
std::vector<int> handled_signals;
bool should_setup_watchdog = false;
char * argv0 = nullptr;
};

View File

@ -0,0 +1,17 @@
#include <sys/timerfd.h>
#include "syscall.h"
int timerfd_create(int clockid, int flags)
{
return syscall(SYS_timerfd_create, clockid, flags);
}
int timerfd_settime(int fd, int flags, const struct itimerspec *new, struct itimerspec *old)
{
return syscall(SYS_timerfd_settime, fd, flags, new, old);
}
int timerfd_gettime(int fd, struct itimerspec *cur)
{
return syscall(SYS_timerfd_gettime, fd, cur);
}
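For context, a hedged sketch of how these syscall wrappers are typically used: arm a one-shot relative timer, then read the 8-byte expiration counter (this mirrors what the TimerDescriptor wrapper below does):

``` cpp
#include <sys/timerfd.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>

int main()
{
    /// The monotonic clock is immune to wall-clock adjustments.
    int fd = timerfd_create(CLOCK_MONOTONIC, 0);
    if (fd == -1)
        return 1;

    /// One-shot timer firing after 100 ms; it_interval == 0 disables periodic mode.
    itimerspec spec{};
    spec.it_value.tv_nsec = 100'000'000;
    if (timerfd_settime(fd, 0, &spec, nullptr) == -1)
        return 1;

    /// A blocking read returns an 8-byte counter of expirations since the last read.
    uint64_t expirations = 0;
    if (read(fd, &expirations, sizeof(expirations)) == ssize_t(sizeof(expirations)))
        printf("timer fired %llu time(s)\n", static_cast<unsigned long long>(expirations));

    close(fd);
    return 0;
}
```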

View File

@ -11,10 +11,11 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
iostreams
program_options
regex
context
)
if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_IOSTREAMS_LIBRARY AND
Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY)
Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY)
set(EXTERNAL_BOOST_FOUND 1)
@ -27,18 +28,21 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
add_library (_boost_program_options INTERFACE)
add_library (_boost_regex INTERFACE)
add_library (_boost_system INTERFACE)
add_library (_boost_context INTERFACE)
target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY})
target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY})
target_link_libraries (_boost_program_options INTERFACE ${Boost_PROGRAM_OPTIONS_LIBRARY})
target_link_libraries (_boost_regex INTERFACE ${Boost_REGEX_LIBRARY})
target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY})
target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY})
add_library (boost::filesystem ALIAS _boost_filesystem)
add_library (boost::iostreams ALIAS _boost_iostreams)
add_library (boost::program_options ALIAS _boost_program_options)
add_library (boost::regex ALIAS _boost_regex)
add_library (boost::system ALIAS _boost_system)
add_library (boost::context ALIAS _boost_context)
else()
set(EXTERNAL_BOOST_FOUND 0)
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost")
@ -142,4 +146,57 @@ if (NOT EXTERNAL_BOOST_FOUND)
add_library (_boost_system ${SRCS_SYSTEM})
add_library (boost::system ALIAS _boost_system)
target_include_directories (_boost_system PRIVATE ${LIBRARY_DIR})
# context
enable_language(ASM)
SET(ASM_OPTIONS "-x assembler-with-cpp")
if (SANITIZE AND (SANITIZE STREQUAL "address" OR SANITIZE STREQUAL "thread"))
add_compile_definitions(BOOST_USE_UCONTEXT)
if (SANITIZE STREQUAL "address")
add_compile_definitions(BOOST_USE_ASAN)
elseif (SANITIZE STREQUAL "thread")
add_compile_definitions(BOOST_USE_TSAN)
endif()
set (SRCS_CONTEXT
${LIBRARY_DIR}/libs/context/src/fiber.cpp
${LIBRARY_DIR}/libs/context/src/continuation.cpp
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
elseif (ARCH_ARM)
set (SRCS_CONTEXT
${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
elseif(OS_DARWIN)
set (SRCS_CONTEXT
${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_macho_gas.S
${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_macho_gas.S
${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_macho_gas.S
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
else()
set (SRCS_CONTEXT
${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_elf_gas.S
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
endif()
add_library (_boost_context ${SRCS_CONTEXT})
add_library (boost::context ALIAS _boost_context)
target_include_directories (_boost_context PRIVATE ${LIBRARY_DIR})
endif ()

View File

@ -54,6 +54,26 @@ else ()
set(CARES_SHARED ON CACHE BOOL "" FORCE)
endif ()
# Disable looking for libnsl on platforms that have gethostbyname in glibc
#
# c-ares searches for gethostbyname in the libnsl library. However, the
# version shipped with gRPC does this incorrectly [1], since it uses
# CHECK_LIBRARY_EXISTS(), which returns TRUE even if the function exists in
# another dependent library. Upstream already contains the correct macro [2],
# but it is not included in gRPC (not even in upstream gRPC, let alone the one
# shipped with clickhouse).
#
# [1]: https://github.com/c-ares/c-ares/blob/e982924acee7f7313b4baa4ee5ec000c5e373c30/CMakeLists.txt#L125
# [2]: https://github.com/c-ares/c-ares/blob/44fbc813685a1fa8aa3f27fcd7544faf612d376a/CMakeLists.txt#L146
#
# As a result, if you happen to have libnsl [3] installed, clickhouse will
# refuse to start without it, even though it is a completely different library.
#
# [3]: https://packages.debian.org/bullseye/libnsl2
if (NOT CMAKE_SYSTEM_NAME STREQUAL "SunOS")
set(HAVE_LIBNSL OFF CACHE BOOL "" FORCE)
endif()
# We don't want to build C# extensions.
set(gRPC_BUILD_CSHARP_EXT OFF)

debian/rules (vendored)
View File

@ -62,7 +62,7 @@ ifndef DISABLE_NINJA
NINJA=$(shell which ninja)
ifneq ($(NINJA),)
CMAKE_FLAGS += -GNinja
export MAKE=$(NINJA)
export MAKE=$(NINJA) $(NINJA_FLAGS)
endif
endif

View File

@ -21,6 +21,7 @@ RUN apt-get update \
libboost-thread-dev \
libboost-iostreams-dev \
libboost-regex-dev \
libboost-context-dev \
zlib1g-dev \
liblz4-dev \
libdouble-conversion-dev \

View File

@ -12,7 +12,32 @@ dpkg -i package_folder/clickhouse-test_*.deb
# install test configs
/usr/share/clickhouse-test/config/install.sh
# For the flaky check we also enable the thread fuzzer
if [ "$NUM_TRIES" -gt "1" ]; then
export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
export THREAD_FUZZER_SLEEP_TIME_US=100000
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
# simplest way to forward env variables to the server
sudo -E -u clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml --daemon
sleep 5
else
service clickhouse-server start && sleep 5
fi
if grep -q -- "--use-skip-list" /usr/bin/clickhouse-test; then
SKIP_LIST_OPT="--use-skip-list"

View File

@ -1,78 +0,0 @@
#!/usr/bin/env python3
import subprocess
import requests
import os
import time
FNAME_START = "+++"
CLOUDFLARE_URL = "https://api.cloudflare.com/client/v4/zones/4fc6fb1d46e87851605aa7fa69ca6fe0/purge_cache"
# we have changes in revision and commit sha on all pages
# so such changes have to be ignored
MIN_CHANGED_WORDS = 4
def collect_changed_files():
proc = subprocess.Popen("git diff HEAD~1 --word-diff=porcelain | grep -e '^+[^+]\|^\-[^\-]\|^\+\+\+'", stdout=subprocess.PIPE, shell=True)
changed_files = []
current_file_name = ""
changed_words = []
while True:
line = proc.stdout.readline().decode("utf-8").strip()
if not line:
break
if FNAME_START in line:
if changed_words:
if len(changed_words) > MIN_CHANGED_WORDS:
changed_files.append(current_file_name)
changed_words = []
current_file_name = line[6:]
else:
changed_words.append(line)
return changed_files
def filter_and_transform_changed_files(changed_files, base_domain):
result = []
for f in changed_files:
if f.endswith(".html"):
result.append(base_domain + f.replace("index.html", ""))
return result
def convert_to_dicts(changed_files, batch_size):
result = []
current_batch = {"files": []}
for f in changed_files:
if len(current_batch["files"]) >= batch_size:
result.append(current_batch)
current_batch = {"files": []}
current_batch["files"].append(f)
if current_batch["files"]:
result.append(current_batch)
return result
def post_data(prepared_batches, token):
headers = {"Authorization": "Bearer {}".format(token)}
for batch in prepared_batches:
print(("Pugring cache for", ", ".join(batch["files"])))
response = requests.post(CLOUDFLARE_URL, json=batch, headers=headers)
response.raise_for_status()
time.sleep(3)
if __name__ == "__main__":
token = os.getenv("CLOUDFLARE_TOKEN")
if not token:
raise Exception("Env variable CLOUDFLARE_TOKEN is empty")
base_domain = os.getenv("BASE_DOMAIN", "https://content.clickhouse.tech/")
changed_files = collect_changed_files()
print(("Found", len(changed_files), "changed files"))
filtered_files = filter_and_transform_changed_files(changed_files, base_domain)
print(("Files rest after filtering", len(filtered_files)))
prepared_batches = convert_to_dicts(filtered_files, 25)
post_data(prepared_batches, token)

View File

@ -32,12 +32,14 @@ then
git add ".nojekyll"
# Push to GitHub rewriting the existing contents.
git commit -a -m "Add new release at $(date)"
git commit --quiet -m "Add new release at $(date)"
git push --force origin master
if [[ ! -z "${CLOUDFLARE_TOKEN}" ]]
then
sleep 1m
python3 "${BASE_DIR}/purge_cache_for_changed_files.py"
# https://api.cloudflare.com/#zone-purge-files-by-cache-tags,-host-or-prefix
POST_DATA='{"hosts":"clickhouse.tech"}'
curl -X POST "https://api.cloudflare.com/client/v4/zones/4fc6fb1d46e87851605aa7fa69ca6fe0/purge_cache" -H "Authorization: Bearer ${CLOUDFLARE_TOKEN}" -H "Content-Type:application/json" --data "${POST_DATA}"
fi
fi

View File

@ -7,9 +7,9 @@ toc_title: DateTime64
# Datetime64 {#data_type-datetime64}
Allows storing an instant in time that can be expressed as a calendar date and a time of day, with defined sub-second precision
This type stores an instant in time as a calendar date plus a time of day, with defined sub-second precision
Tick size (precision): 10<sup>-precision</sup>
Time tick size (precision): 10<sup>-precision</sup> seconds
Syntax:
@ -17,11 +17,11 @@ toc_title: DateTime64
DateTime64(precision, [timezone])
```
Internally, it stores data as a number of ticks since the start of the epoch (1970-01-01 00:00:00 UTC) as Int64. The tick resolution is determined by the precision parameter. Additionally, the `DateTime64` type can store a time zone that is the same for the whole column, affecting how `DateTime64` values are displayed in text format and how values specified as strings are parsed (2020-01-01 05:00:01.000). The time zone is not stored in the rows of the table or in the resultset, but in the column metadata. See details in [DateTime](datetime.md).
Internally, this type stores data as the number of time ticks since the start of the Linux epoch (1970-01-01 00:00:00 UTC), using Int64. The tick resolution is determined by the precision parameter. In addition, the `DateTime64` type can store time zone information like any other data column; the time zone affects how `DateTime64` values are displayed in text format and how values specified as strings are parsed (2020-01-01 05:00:01.000). The time zone is not stored in the rows of the table nor in the resultset, but in the column metadata. For details, see the [DateTime](datetime.md) data type.
## Examples {#examples}
**1.** Create a `DateTime64` column and insert data into it:
**1.** Create a table with a `DateTime64`-type column and insert data into it:
``` sql
CREATE TABLE dt
@ -47,10 +47,10 @@ SELECT * FROM dt
└─────────────────────────┴──────────┘
```
- When inserting a datetime as an integer, it is treated as an appropriately scaled Unix timestamp (UTC). `1546300800000` (with precision 3) represents `'2019-01-01 00:00:00'` UTC. However, as the `timestamp` column has the `Europe/Moscow` (UTC+3) time zone specified, when output as a string the value will be displayed as `'2019-01-01 03:00:00'`
- When inserting a string value as a datetime, it is treated as being in the column's time zone. `'2019-01-01 00:00:00'` will be treated as being in the `Europe/Moscow` time zone and stored as `1546290000000`.
- When inserting a datetime as an integer type, it is treated as an appropriately scaled Unix timestamp (UTC). `1546300800000` (with precision 3) represents `'2019-01-01 00:00:00'` UTC. However, because the `timestamp` column has the `Europe/Moscow` (UTC+3) time zone specified, when output as a string the value will be displayed as `'2019-01-01 03:00:00'`
- When a string is inserted as a datetime, it is given the column's time zone information. `'2019-01-01 00:00:00'` will be treated as being in the `Europe/Moscow` time zone and stored as `1546290000000`.
**2.** Filtering on `DateTime64` values
**2.** Filtering on `DateTime64`-type values
``` sql
SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow')
@ -62,9 +62,9 @@ SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europ
└─────────────────────────┴──────────┘
```
Unlike `DateTime`, `DateTime64` values are not converted to `String` automatically
Unlike `DateTime`, values of the `DateTime64` type are not automatically converted from `String` values
**3.** Getting a time zone for a `DateTime64`-type value:
**3.** Getting the time zone information of a `DateTime64`-type value:
``` sql
SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x
@ -97,8 +97,9 @@ FROM dt
- [Type conversion functions](../../sql-reference/functions/type-conversion-functions.md)
- [Functions for working with dates and times](../../sql-reference/functions/date-time-functions.md)
- [Functions for working with arrays](../../sql-reference/functions/array-functions.md)
- [The `date_time_input_format` setting](../../operations/settings/settings.md#settings-date_time_input_format)
- [The `timezone` server configuration parameter](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone)
- [Operators for working with dates and times](../../sql-reference/operators/index.md#operators-datetime)
- [`date_time_input_format` setting](../../operations/settings/settings.md#settings-date_time_input_format)
- [`date_time_output_format` setting](../../operations/settings/settings.md#settings-date_time_output_format)
- [`timezone` server configuration parameter](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone)
- [Date and time operators](../../sql-reference/operators/index.md#operators-datetime)
- [`Date` data type](date.md)
- [`DateTime` data type](datetime.md)

View File

@ -949,6 +949,11 @@ private:
TestHint test_hint(test_mode, all_queries_text);
if (test_hint.clientError() || test_hint.serverError())
processTextAsSingleQuery("SET send_logs_level = 'none'");
// Echo all queries if asked; makes for a more readable reference
// file.
if (test_hint.echoQueries())
echo_queries = true;
}
/// Several queries separated by ';'.

View File

@ -14,6 +14,7 @@
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTInsertQuery.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTOrderByElement.h>
#include <Parsers/ASTQueryWithOutput.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTSelectWithUnionQuery.h>
@ -28,6 +29,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int TOO_DEEP_RECURSION;
}
Field QueryFuzzer::getRandomField(int type)
{
switch (type)
@ -205,14 +211,88 @@ void QueryFuzzer::replaceWithTableLike(ASTPtr & ast)
ast = new_ast;
}
void QueryFuzzer::fuzzColumnLikeExpressionList(ASTPtr ast)
void QueryFuzzer::fuzzOrderByElement(ASTOrderByElement * elem)
{
switch (fuzz_rand() % 10)
{
case 0:
elem->direction = -1;
break;
case 1:
elem->direction = 1;
break;
case 2:
elem->nulls_direction = -1;
elem->nulls_direction_was_explicitly_specified = true;
break;
case 3:
elem->nulls_direction = 1;
elem->nulls_direction_was_explicitly_specified = true;
break;
case 4:
elem->nulls_direction = elem->direction;
elem->nulls_direction_was_explicitly_specified = false;
break;
default:
// do nothing
break;
}
}
void QueryFuzzer::fuzzOrderByList(IAST * ast)
{
if (!ast)
{
return;
}
auto * impl = assert_cast<ASTExpressionList *>(ast.get());
auto * list = assert_cast<ASTExpressionList *>(ast);
// Remove element
if (fuzz_rand() % 50 == 0 && list->children.size() > 1)
{
// Don't remove last element -- this leads to questionable
// constructs such as empty select.
list->children.erase(list->children.begin()
+ fuzz_rand() % list->children.size());
}
// Add element
if (fuzz_rand() % 50 == 0)
{
auto pos = list->children.empty()
? list->children.begin()
: list->children.begin() + fuzz_rand() % list->children.size();
auto col = getRandomColumnLike();
if (col)
{
auto elem = std::make_shared<ASTOrderByElement>();
elem->children.push_back(col);
elem->direction = 1;
elem->nulls_direction = 1;
elem->nulls_direction_was_explicitly_specified = false;
elem->with_fill = false;
list->children.insert(pos, elem);
}
else
{
fprintf(stderr, "no random col!\n");
}
}
// We don't have to recurse here to fuzz the children, this is handled by
// the generic recursion into IAST.children.
}
void QueryFuzzer::fuzzColumnLikeExpressionList(IAST * ast)
{
if (!ast)
{
return;
}
auto * impl = assert_cast<ASTExpressionList *>(ast);
// Remove element
if (fuzz_rand() % 50 == 0 && impl->children.size() > 1)
@ -252,11 +332,44 @@ void QueryFuzzer::fuzz(ASTs & asts)
}
}
struct ScopedIncrement
{
size_t & counter;
explicit ScopedIncrement(size_t & counter_) : counter(counter_) { ++counter; }
~ScopedIncrement() { --counter; }
};
void QueryFuzzer::fuzz(ASTPtr & ast)
{
if (!ast)
return;
// Check for exceeding max depth.
ScopedIncrement depth_increment(current_ast_depth);
if (current_ast_depth > 500)
{
// The AST is too deep (see the comment for current_ast_depth). Throw
// an exception to fail fast and not use this query as a reference, or we'll
// end up in a very slow and useless loop. It also makes sense to set it
// lower than the default max parse depth on the server (1000), so that
// we don't get the useless error about parse depth from the server either.
throw Exception(ErrorCodes::TOO_DEEP_RECURSION,
"AST depth exceeded while fuzzing ({})", current_ast_depth);
}
// Check for loops.
auto [_, inserted] = debug_visited_nodes.insert(ast.get());
if (!inserted)
{
fmt::print(stderr, "The AST node '{}' was already visited before."
" Depth {}, {} visited nodes, current top AST:\n{}\n",
static_cast<void *>(ast.get()), current_ast_depth,
debug_visited_nodes.size(), (*debug_top_ast)->dumpTree());
assert(false);
}
// The fuzzing.
if (auto * with_union = typeid_cast<ASTSelectWithUnionQuery *>(ast.get()))
{
fuzz(with_union->list_of_selects);
@ -281,17 +394,28 @@ void QueryFuzzer::fuzz(ASTPtr & ast)
{
fuzz(expr_list->children);
}
else if (auto * order_by_element = typeid_cast<ASTOrderByElement *>(ast.get()))
{
fuzzOrderByElement(order_by_element);
}
else if (auto * fn = typeid_cast<ASTFunction *>(ast.get()))
{
fuzzColumnLikeExpressionList(fn->arguments);
fuzzColumnLikeExpressionList(fn->parameters);
fuzzColumnLikeExpressionList(fn->arguments.get());
fuzzColumnLikeExpressionList(fn->parameters.get());
if (fn->is_window_function)
{
fuzzColumnLikeExpressionList(fn->window_partition_by);
fuzzOrderByList(fn->window_order_by);
}
fuzz(fn->children);
}
else if (auto * select = typeid_cast<ASTSelectQuery *>(ast.get()))
{
fuzzColumnLikeExpressionList(select->select());
fuzzColumnLikeExpressionList(select->groupBy());
fuzzColumnLikeExpressionList(select->select().get());
fuzzColumnLikeExpressionList(select->groupBy().get());
fuzzOrderByList(select->orderBy().get());
fuzz(select->children);
}
@ -416,6 +540,10 @@ void QueryFuzzer::collectFuzzInfoRecurse(const ASTPtr ast)
void QueryFuzzer::fuzzMain(ASTPtr & ast)
{
current_ast_depth = 0;
debug_visited_nodes.clear();
debug_top_ast = &ast;
collectFuzzInfoMain(ast);
fuzz(ast);

View File

@ -12,6 +12,9 @@
namespace DB
{
class ASTExpressionList;
class ASTOrderByElement;
/*
* This is an AST-based query fuzzer that makes random modifications to query
* AST, changing numbers, list of columns, functions, etc. It remembers part of
@ -23,6 +26,13 @@ struct QueryFuzzer
{
pcg64 fuzz_rand{randomSeed()};
// We add elements to expression lists with fixed probability. Some elements
// are so large that the expected number of elements we add to them is
// one or higher, hence this process might never finish. Put some limit on the
// total depth of the AST to prevent this.
// This field is reset for each fuzzMain() call.
size_t current_ast_depth = 0;
// These arrays hold parts of queries that we can substitute into the query
// we are currently fuzzing. We add some part from each new query we are asked
// to fuzz, and keep this state between queries, so the fuzzing output becomes
@ -36,6 +46,12 @@ struct QueryFuzzer
std::unordered_map<std::string, ASTPtr> table_like_map;
std::vector<ASTPtr> table_like;
// Some debug fields for detecting problematic ASTs with loops.
// These are reset for each fuzzMain call.
std::unordered_set<const IAST *> debug_visited_nodes;
ASTPtr * debug_top_ast;
// This is the only function you have to call -- it will modify the passed
// ASTPtr to point to new AST with some random changes.
void fuzzMain(ASTPtr & ast);
@ -46,7 +62,9 @@ struct QueryFuzzer
ASTPtr getRandomColumnLike();
void replaceWithColumnLike(ASTPtr & ast);
void replaceWithTableLike(ASTPtr & ast);
void fuzzColumnLikeExpressionList(ASTPtr ast);
void fuzzOrderByElement(ASTOrderByElement * elem);
void fuzzOrderByList(IAST * ast);
void fuzzColumnLikeExpressionList(IAST * ast);
void fuzz(ASTs & asts);
void fuzz(ASTPtr & ast);
void collectFuzzInfoMain(const ASTPtr ast);

View File

@ -19,6 +19,7 @@ namespace ErrorCodes
/// Checks expected server and client error codes in testmode.
/// To enable it add special comment after the query: "-- { serverError 60 }" or "-- { clientError 20 }".
/// Also you can enable echoing all queries by writing "-- { echo }".
class TestHint
{
public:
@ -84,12 +85,14 @@ public:
int serverError() const { return server_error; }
int clientError() const { return client_error; }
bool echoQueries() const { return echo; }
private:
bool enabled = false;
const String & query;
int server_error = 0;
int client_error = 0;
bool echo = false;
void parse(const String & hint)
{
@ -107,6 +110,8 @@ private:
ss >> server_error;
else if (item == "clientError")
ss >> client_error;
else if (item == "echo")
echo = true;
}
}

View File

@ -10,6 +10,10 @@
#include <linux/capability.h>
#endif
#if defined(OS_DARWIN)
#include <mach-o/dyld.h>
#endif
#include <Common/Exception.h>
#include <Common/ShellCommand.h>
#include <Common/formatReadable.h>
@ -147,9 +151,24 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
try
{
/// We need to copy binary to the binary directory.
/// The binary is currently run. We need to obtain its path from procfs.
/// The binary is currently run. We need to obtain its path from procfs (on Linux).
#if defined(OS_DARWIN)
uint32_t path_length = 0;
_NSGetExecutablePath(nullptr, &path_length);
if (path_length <= 1)
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary");
std::string path(path_length, std::string::value_type());
auto res = _NSGetExecutablePath(&path[0], &path_length);
if (res != 0)
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary");
fs::path binary_self_path(path);
#else
fs::path binary_self_path = "/proc/self/exe";
#endif
if (!fs::exists(binary_self_path))
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary from {}, file doesn't exist",
binary_self_path.string());

View File

@ -4,6 +4,7 @@
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>
#include <pwd.h>
#include <unistd.h>
@ -103,6 +104,7 @@ namespace CurrentMetrics
int mainEntryClickHouseServer(int argc, char ** argv)
{
DB::Server app;
app.shouldSetupWatchdog(argc ? argv[0] : nullptr);
try
{
return app.run(argc, argv);
@ -366,6 +368,7 @@ void checkForUsersNotInMainConfig(
int Server::main(const std::vector<std::string> & /*args*/)
{
Poco::Logger * log = &logger();
UseSSL use_ssl;
MainThreadStatus::getInstance();

View File

@ -127,10 +127,10 @@ public:
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override
{
if constexpr (IsDecimalNumber<Numerator> || IsDecimalNumber<Denominator>)
static_cast<ColumnVector<Float64> &>(to).getData().push_back(
assert_cast<ColumnVector<Float64> &>(to).getData().push_back(
this->data(place).divideIfAnyDecimal(num_scale, denom_scale));
else
static_cast<ColumnVector<Float64> &>(to).getData().push_back(this->data(place).divide());
assert_cast<ColumnVector<Float64> &>(to).getData().push_back(this->data(place).divide());
}
private:
UInt32 num_scale;

View File

@ -104,9 +104,12 @@ public:
return false;
}
/// Inserts results into a column.
/// This method must be called once, from single thread.
/// After this method was called for state, you can't do anything with state but destroy.
/// Inserts results into a column. This method might modify the state (e.g.
/// sort an array), so must be called once, from single thread. The state
/// must remain valid though, and the subsequent calls to add/merge/
/// insertResultInto must work correctly. This kind of call sequence occurs
/// in `runningAccumulate`, or when calculating an aggregate function as a
/// window function.
virtual void insertResultInto(AggregateDataPtr place, IColumn & to, Arena * arena) const = 0;
/// Used for machine learning methods. Predict result from trained model.
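A self-contained toy analogue of the insertResultInto contract above (names are illustrative, not engine code): the state keeps accepting add() calls after each result extraction, the way runningAccumulate uses it:

``` cpp
#include <vector>
#include <cstdio>

/// Stand-in for an aggregate state obeying the contract above: extracting a
/// result may touch the state, but the state must stay valid for further adds.
struct RunningSum
{
    long long sum = 0;
    void add(long long v) { sum += v; }
    void insertResultInto(std::vector<long long> & to) { to.push_back(sum); }
};

int main()
{
    RunningSum state;
    std::vector<long long> result;
    for (long long v : {1, 2, 3})
    {
        state.add(v);                   /// keep accumulating into the same state
        state.insertResultInto(result); /// emit the running value each time
    }
    for (long long v : result)
        printf("%lld\n", v); /// prints 1, 3, 6
    return 0;
}
```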

View File

@ -436,6 +436,8 @@ if (USE_ROCKSDB)
dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${ROCKSDB_INCLUDE_DIR})
endif()
dbms_target_link_libraries(PRIVATE _boost_context)
if (ENABLE_TESTS AND USE_GTEST)
macro (grep_gtest_sources BASE_DIR DST_VAR)
# Could match files that are not in tests/ directories

View File

@ -742,8 +742,11 @@ std::optional<UInt64> Connection::checkPacket(size_t timeout_microseconds)
}
Packet Connection::receivePacket()
Packet Connection::receivePacket(std::function<void(Poco::Net::Socket &)> async_callback)
{
in->setAsyncCallback(std::move(async_callback));
SCOPE_EXIT(in->setAsyncCallback({}));
try
{
Packet res;

View File

@ -18,6 +18,7 @@
#include <DataStreams/BlockStreamProfileInfo.h>
#include <IO/ConnectionTimeouts.h>
#include <IO/ReadBufferFromPocoSocket.h>
#include <Interpreters/TablesStatus.h>
@ -171,7 +172,8 @@ public:
std::optional<UInt64> checkPacket(size_t timeout_microseconds = 0);
/// Receive packet from server.
Packet receivePacket();
/// Each time a read blocks and async_callback is set, it will be called. You can poll the socket inside it.
Packet receivePacket(std::function<void(Poco::Net::Socket &)> async_callback = {});
/// If not connected yet, or if connection is broken - then connect. If cannot connect - throw an exception.
void forceConnected(const ConnectionTimeouts & timeouts);
@ -226,7 +228,7 @@ private:
String server_display_name;
std::unique_ptr<Poco::Net::StreamSocket> socket;
std::shared_ptr<ReadBuffer> in;
std::shared_ptr<ReadBufferFromPocoSocket> in;
std::shared_ptr<WriteBuffer> out;
std::optional<UInt64> last_input_packet_type;

View File

@ -237,7 +237,7 @@ std::string MultiplexedConnections::dumpAddressesUnlocked() const
return buf.str();
}
Packet MultiplexedConnections::receivePacketUnlocked()
Packet MultiplexedConnections::receivePacketUnlocked(std::function<void(Poco::Net::Socket &)> async_callback)
{
if (!sent_query)
throw Exception("Cannot receive packets: no query sent.", ErrorCodes::LOGICAL_ERROR);
@ -249,7 +249,7 @@ Packet MultiplexedConnections::receivePacketUnlocked()
if (current_connection == nullptr)
throw Exception("Logical error: no available replica", ErrorCodes::NO_AVAILABLE_REPLICA);
Packet packet = current_connection->receivePacket();
Packet packet = current_connection->receivePacket(std::move(async_callback));
switch (packet.type)
{

View File

@ -69,7 +69,7 @@ public:
private:
/// Internal version of `receivePacket` function without locking.
Packet receivePacketUnlocked();
Packet receivePacketUnlocked(std::function<void(Poco::Net::Socket &)> async_callback = {});
/// Internal version of `dumpAddresses` function without locking.
std::string dumpAddressesUnlocked() const;
@ -105,6 +105,8 @@ private:
/// A mutex for the sendCancel function to execute safely
/// in separate thread.
mutable std::mutex cancel_mutex;
friend class RemoteQueryExecutorReadContext;
};
}

src/Common/Fiber.h (new file)
View File

@ -0,0 +1,5 @@
#pragma once
#include <common/defines.h>
#include <boost/context/fiber.hpp>
using Fiber = boost::context::fiber;

src/Common/FiberStack.h (new file)
View File

@ -0,0 +1,74 @@
#pragma once
#include <common/defines.h>
#include <boost/context/stack_context.hpp>
#include <Common/formatReadable.h>
#include <Common/MemoryTracker.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/mman.h>
#if defined(BOOST_USE_VALGRIND)
#include <valgrind/valgrind.h>
#endif
namespace DB::ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY;
}
/// This is an implementation of an allocator for fiber stacks.
/// The reference implementation is protected_fixedsize_stack from boost::context.
/// This implementation additionally tracks memory usage, which is the main reason it is needed.
class FiberStack
{
private:
size_t stack_size;
size_t page_size = 0;
public:
static constexpr size_t default_stack_size = 128 * 1024; /// 64KB was not enough for tests
explicit FiberStack(size_t stack_size_ = default_stack_size) : stack_size(stack_size_)
{
page_size = ::sysconf(_SC_PAGESIZE);
}
boost::context::stack_context allocate()
{
size_t num_pages = 1 + (stack_size - 1) / page_size;
size_t num_bytes = (num_pages + 1) * page_size; /// Add one page at the bottom that will be used as a guard page
void * vp = ::mmap(nullptr, num_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (MAP_FAILED == vp)
DB::throwFromErrno(fmt::format("FiberStack: Cannot mmap {}.", ReadableSize(num_bytes)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
if (-1 == ::mprotect(vp, page_size, PROT_NONE))
{
::munmap(vp, num_bytes);
DB::throwFromErrno("FiberStack: cannot protect guard page", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
}
/// Do not count guard page in memory usage.
CurrentMemoryTracker::alloc(num_pages * page_size);
boost::context::stack_context sctx;
sctx.size = num_bytes;
sctx.sp = static_cast< char * >(vp) + sctx.size;
#if defined(BOOST_USE_VALGRIND)
sctx.valgrind_stack_id = VALGRIND_STACK_REGISTER(sctx.sp, vp);
#endif
return sctx;
}
void deallocate(boost::context::stack_context & sctx)
{
#if defined(BOOST_USE_VALGRIND)
VALGRIND_STACK_DEREGISTER(sctx.valgrind_stack_id);
#endif
void * vp = static_cast< char * >(sctx.sp) - sctx.size;
::munmap(vp, sctx.size);
/// Do not count guard page in memory usage.
CurrentMemoryTracker::free(sctx.size - page_size);
}
};
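A minimal, hedged usage sketch: FiberStack plugs into boost::context::fiber through the allocator_arg constructor, the same way RemoteQueryExecutorReadContext uses it (the fiber body here is illustrative):

``` cpp
#include <boost/context/fiber.hpp>
#include <memory>
#include <cstdio>

/// Assumes the FiberStack class above is available in this translation unit.
int main()
{
    namespace ctx = boost::context;

    /// Same construction pattern as in the executor code:
    /// fiber(std::allocator_arg_t(), stack, routine).
    ctx::fiber f(
        std::allocator_arg, FiberStack(),
        [](ctx::fiber && sink)
        {
            printf("inside fiber\n");
            sink = std::move(sink).resume(); /// yield back to the caller
            printf("fiber resumed\n");
            return std::move(sink);
        });

    f = std::move(f).resume(); /// run until the fiber yields
    f = std::move(f).resume(); /// run to completion
    return 0;
}
```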

View File

@ -10,6 +10,7 @@
#include <common/sleep.h>
#include <IO/ReadHelpers.h>
#include <common/logger_useful.h>
#include <Common/Exception.h>
#include <Common/thread_local_rng.h>

View File

@ -0,0 +1,84 @@
#if defined(OS_LINUX)
#include <Common/TimerDescriptor.h>
#include <Common/Exception.h>
#include <sys/timerfd.h>
#include <fcntl.h>
#include <unistd.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_CREATE_TIMER;
extern const int CANNOT_SET_TIMER_PERIOD;
extern const int CANNOT_FCNTL;
extern const int CANNOT_READ_FROM_SOCKET;
}
TimerDescriptor::TimerDescriptor(int clockid, int flags)
{
timer_fd = timerfd_create(clockid, flags);
if (timer_fd == -1)
throw Exception(ErrorCodes::CANNOT_CREATE_TIMER, "Cannot create timer_fd descriptor");
if (-1 == fcntl(timer_fd, F_SETFL, O_NONBLOCK))
throwFromErrno("Cannot set O_NONBLOCK for timer_fd", ErrorCodes::CANNOT_FCNTL);
}
TimerDescriptor::~TimerDescriptor()
{
/// Do not check the result because we cannot throw an exception here.
close(timer_fd);
}
void TimerDescriptor::reset() const
{
itimerspec spec;
spec.it_interval.tv_nsec = 0;
spec.it_interval.tv_sec = 0;
spec.it_value.tv_sec = 0;
spec.it_value.tv_nsec = 0;
if (-1 == timerfd_settime(timer_fd, 0 /*relative timer */, &spec, nullptr))
throwFromErrno("Cannot reset timer_fd", ErrorCodes::CANNOT_SET_TIMER_PERIOD);
/// Drain the descriptor.
/// It is possible that the alarm has already fired and the descriptor is readable.
drain();
}
void TimerDescriptor::drain() const
{
/// It is expected that the descriptor returns 8 bytes when readable.
/// Read in a loop anyway, because a signal may interrupt the read call.
uint64_t buf;
while (true)
{
ssize_t res = ::read(timer_fd, &buf, sizeof(buf));
if (res < 0)
{
if (errno == EAGAIN)
break;
if (errno != EINTR)
throwFromErrno("Cannot drain timer_fd", ErrorCodes::CANNOT_READ_FROM_SOCKET);
}
}
}
void TimerDescriptor::setRelative(const Poco::Timespan & timespan) const
{
itimerspec spec;
spec.it_interval.tv_nsec = 0;
spec.it_interval.tv_sec = 0;
spec.it_value.tv_sec = timespan.totalSeconds();
spec.it_value.tv_nsec = timespan.useconds();
if (-1 == timerfd_settime(timer_fd, 0 /*relative timer */, &spec, nullptr))
throwFromErrno("Cannot set time for timer_fd", ErrorCodes::CANNOT_SET_TIMER_PERIOD);
}
}
#endif

View File

@ -0,0 +1,31 @@
#pragma once
#if defined(OS_LINUX)
#include <Poco/Timespan.h>
namespace DB
{
/// Wrapper over timerfd.
class TimerDescriptor
{
private:
int timer_fd;
public:
explicit TimerDescriptor(int clockid, int flags);
~TimerDescriptor();
TimerDescriptor(const TimerDescriptor &) = delete;
TimerDescriptor & operator=(const TimerDescriptor &) = delete;
TimerDescriptor(TimerDescriptor &&) = default;
TimerDescriptor & operator=(TimerDescriptor &&) = default;
int getDescriptor() const { return timer_fd; }
void reset() const;
void drain() const;
void setRelative(const Poco::Timespan & timespan) const;
};
}
#endif
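A hedged sketch of the intended use, following the pattern in RemoteQueryExecutorReadContext below: register the timer's descriptor in an epoll set and arm it before waiting, so a socket timeout shows up as a readable fd (error handling elided):

``` cpp
#include <sys/epoll.h>
#include <Poco/Timespan.h>
#include <Common/TimerDescriptor.h>

/// Illustrative helper, not part of the class: arm a 5-second one-shot timeout
/// and watch it via epoll alongside other descriptors.
void armAndWatch(DB::TimerDescriptor & timer, int epoll_fd)
{
    timer.setRelative(Poco::Timespan(5, 0)); /// 5 seconds, 0 microseconds

    epoll_event event{};
    event.events = EPOLLIN;
    event.data.fd = timer.getDescriptor();
    epoll_ctl(epoll_fd, EPOLL_CTL_ADD, event.data.fd, &event);

    /// ... epoll_wait(); if this fd is ready, the wait has timed out ...
    /// After waking up for any other reason, disarm and drain the timer:
    timer.reset();
}
```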

View File

@ -4,9 +4,6 @@ target_link_libraries(zkutil_test_commands PRIVATE clickhouse_common_zookeeper)
add_executable(zkutil_test_commands_new_lib zkutil_test_commands_new_lib.cpp)
target_link_libraries(zkutil_test_commands_new_lib PRIVATE clickhouse_common_zookeeper string_utils)
add_executable(zkutil_expiration_test zkutil_expiration_test.cpp)
target_link_libraries(zkutil_expiration_test PRIVATE clickhouse_common_zookeeper)
add_executable(zkutil_test_async zkutil_test_async.cpp)
target_link_libraries(zkutil_test_async PRIVATE clickhouse_common_zookeeper)

View File

@ -1,15 +0,0 @@
#!/usr/bin/env bash
# Adds firewall rules that drop packets to the ZooKeeper servers.
# Used for testing how programs behave when the connection to ZooKeeper is lost.
# yeszk.sh reverts the changes.
# To see which rules are currently in place, use sudo iptables -L and sudo ip6tables -L
sudo iptables -A OUTPUT -p tcp --dport 2181 -j DROP
sudo ip6tables -A OUTPUT -p tcp --dport 2181 -j DROP
# You could also test random drops:
#sudo iptables -A OUTPUT -p tcp --dport 2181 -j REJECT --reject-with tcp-reset -m statistic --mode random --probability 0.1
#sudo ip6tables -A OUTPUT -p tcp --dport 2181 -j REJECT --reject-with tcp-reset -m statistic --mode random --probability 0.1

View File

@ -1,6 +0,0 @@
#!/usr/bin/env bash
# Reverts the actions of nozk.sh
cat nozk.sh | sed 's/-A/-D/g' | bash

View File

@ -1,70 +0,0 @@
#include <iostream>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <Poco/ConsoleChannel.h>
#include <Common/Exception.h>
/// Checks which errors ZooKeeper returns when you attempt an operation at various times after the session has expired.
/// Spoiler: multi sometimes crashes with a segfault, and before that it fails with a marshalling error.
/// create always fails with an invalid zhandle state.
int main(int argc, char ** argv)
{
try
{
if (argc != 2)
{
std::cerr << "usage: " << argv[0] << " hosts" << std::endl;
return 2;
}
Poco::AutoPtr<Poco::ConsoleChannel> channel = new Poco::ConsoleChannel(std::cerr);
Poco::Logger::root().setChannel(channel);
Poco::Logger::root().setLevel("trace");
zkutil::ZooKeeper zk(argv[1]);
std::string unused;
zk.tryCreate("/test", "", zkutil::CreateMode::Persistent, unused);
std::cerr << "Please run `./nozk.sh && sleep 40s && ./yeszk.sh`" << std::endl;
time_t time0 = time(nullptr);
while (true)
{
{
Coordination::Requests ops;
ops.emplace_back(zkutil::makeCreateRequest("/test/zk_expiration_test", "hello", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeRemoveRequest("/test/zk_expiration_test", -1));
Coordination::Responses responses;
Coordination::Error code = zk.tryMultiNoThrow(ops, responses);
std::cout << time(nullptr) - time0 << "s: " << Coordination::errorMessage(code) << std::endl;
try
{
if (code != Coordination::Error::ZOK)
std::cout << "Path: " << zkutil::KeeperMultiException(code, ops, responses).getPathForFirstFailedOp() << std::endl;
}
catch (...)
{
std::cout << DB::getCurrentExceptionMessage(false) << std::endl;
}
}
sleep(1);
}
}
catch (Coordination::Exception &)
{
std::cerr << "KeeperException: " << DB::getCurrentExceptionMessage(true) << std::endl;
return 1;
}
catch (...)
{
std::cerr << "Some exception: " << DB::getCurrentExceptionMessage(true) << std::endl;
return 2;
}
}

View File

@ -75,6 +75,7 @@ SRCS(
ThreadPool.cpp
ThreadProfileEvents.cpp
ThreadStatus.cpp
TimerDescriptor.cpp
TraceCollector.cpp
UTF8Helpers.cpp
UnicodeBar.cpp

View File

@ -406,9 +406,12 @@ class IColumn;
M(Bool, optimize_skip_merged_partitions, false, "Skip partitions with one part with level > 0 in optimize final", 0) \
M(Bool, optimize_on_insert, true, "Do the same transformation for inserted block of data as if merge was done on this block.", 0) \
M(Bool, allow_experimental_map_type, false, "Allow data type Map", 0) \
M(Bool, allow_experimental_window_functions, false, "Allow experimental window functions", 0) \
\
M(Bool, use_antlr_parser, false, "Parse incoming queries using ANTLR-generated parser", 0) \
\
M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \
\
/** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \
\
M(UInt64, max_memory_usage_for_all_queries, 0, "Obsolete. Will be removed after 2020-10-20", 0) \

View File

@ -37,5 +37,12 @@ void dumpSortDescription(const SortDescription & description, const Block & head
}
}
std::string dumpSortDescription(const SortDescription & description)
{
WriteBufferFromOwnString wb;
dumpSortDescription(description, Block{}, wb);
return wb.str();
}
}

View File

@ -72,4 +72,6 @@ class Block;
/// Outputs user-readable description into `out`.
void dumpSortDescription(const SortDescription & description, const Block & header, WriteBuffer & out);
std::string dumpSortDescription(const SortDescription & description);
}

View File

@ -1,4 +1,5 @@
#include <DataStreams/RemoteQueryExecutor.h>
#include <DataStreams/RemoteQueryExecutorReadContext.h>
#include <Columns/ColumnConst.h>
#include <Common/CurrentThread.h>
@ -11,6 +12,7 @@
#include <Interpreters/Context.h>
#include <Interpreters/InternalTextLogsQueue.h>
#include <IO/ConnectionTimeoutsContext.h>
#include <Common/FiberStack.h>
namespace DB
{
@ -192,6 +194,56 @@ Block RemoteQueryExecutor::read()
Packet packet = multiplexed_connections->receivePacket();
if (auto block = processPacket(std::move(packet)))
return *block;
}
}
std::variant<Block, int> RemoteQueryExecutor::read(std::unique_ptr<ReadContext> & read_context [[maybe_unused]])
{
#if defined(OS_LINUX)
if (!sent_query)
{
sendQuery();
if (context.getSettingsRef().skip_unavailable_shards && (0 == multiplexed_connections->size()))
return Block();
}
if (!read_context)
{
std::lock_guard lock(was_cancelled_mutex);
if (was_cancelled)
return Block();
read_context = std::make_unique<ReadContext>(*multiplexed_connections);
}
do
{
if (!read_context->resumeRoutine())
return Block();
if (read_context->is_read_in_progress)
{
read_context->setTimer();
return read_context->epoll_fd;
}
else
{
if (auto data = processPacket(std::move(read_context->packet)))
return std::move(*data);
}
}
while (true);
#else
return read();
#endif
}
std::optional<Block> RemoteQueryExecutor::processPacket(Packet packet)
{
switch (packet.type)
{
case Protocol::Server::Data:
@ -250,10 +302,11 @@ Block RemoteQueryExecutor::read()
toString(packet.type),
multiplexed_connections->dumpAddresses());
}
}
return {};
}
void RemoteQueryExecutor::finish()
void RemoteQueryExecutor::finish(std::unique_ptr<ReadContext> * read_context)
{
/** If one of:
* - nothing started to do;
@ -270,7 +323,7 @@ void RemoteQueryExecutor::finish()
*/
/// Send the request to abort the execution of the request, if not already sent.
tryCancel("Cancelling query because enough data has been read");
tryCancel("Cancelling query because enough data has been read", read_context);
/// Get the remaining packets so that the connections to the replicas do not get out of sync.
Packet packet = multiplexed_connections->drain();
@ -299,7 +352,7 @@ void RemoteQueryExecutor::finish()
}
}
void RemoteQueryExecutor::cancel()
void RemoteQueryExecutor::cancel(std::unique_ptr<ReadContext> * read_context)
{
{
std::lock_guard lock(external_tables_mutex);
@ -313,7 +366,7 @@ void RemoteQueryExecutor::cancel()
if (!isQueryPending() || hasThrownException())
return;
tryCancel("Cancelling query");
tryCancel("Cancelling query", read_context);
}
void RemoteQueryExecutor::sendScalars()
@ -365,7 +418,7 @@ void RemoteQueryExecutor::sendExternalTables()
multiplexed_connections->sendExternalTablesData(external_tables_data);
}
void RemoteQueryExecutor::tryCancel(const char * reason)
void RemoteQueryExecutor::tryCancel(const char * reason, std::unique_ptr<ReadContext> * read_context)
{
{
/// Flag was_cancelled is atomic because it is checked in read().
@ -375,6 +428,10 @@ void RemoteQueryExecutor::tryCancel(const char * reason)
return;
was_cancelled = true;
if (read_context && *read_context)
(*read_context)->cancel();
multiplexed_connections->sendCancel();
}

View File

@ -5,6 +5,9 @@
#include <Storages/IStorage_fwd.h>
#include <Interpreters/Context.h>
#include <Interpreters/StorageID.h>
#include <Common/FiberStack.h>
#include <Common/TimerDescriptor.h>
#include <variant>
namespace DB
{
@ -20,10 +23,14 @@ using ProgressCallback = std::function<void(const Progress & progress)>;
struct BlockStreamProfileInfo;
using ProfileInfoCallback = std::function<void(const BlockStreamProfileInfo & info)>;
class RemoteQueryExecutorReadContext;
/// This class allows one to launch queries on remote replicas of one shard and get results
class RemoteQueryExecutor
{
public:
using ReadContext = RemoteQueryExecutorReadContext;
/// Takes already set connection.
RemoteQueryExecutor(
Connection & connection,
@ -53,13 +60,17 @@ public:
/// Read next block of data. Returns empty block if query is finished.
Block read();
/// Async variant of read. Returns a ready block or a file descriptor which may be used for polling.
/// ReadContext is the internal read state. Pass an empty ptr the first time; reuse the created one on every subsequent call.
std::variant<Block, int> read(std::unique_ptr<ReadContext> & read_context);
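A hedged usage sketch of this API, following the comment above: reuse one read_context across calls, process the returned Block, or poll the returned file descriptor before calling read() again (the consumer and poll helper are hypothetical):

``` cpp
/// Sketch only; assumes an existing RemoteQueryExecutor `executor`,
/// plus hypothetical process() and waitForFd() helpers.
std::unique_ptr<RemoteQueryExecutor::ReadContext> read_context;
while (true)
{
    std::variant<Block, int> res = executor.read(read_context);
    if (auto * block = std::get_if<Block>(&res))
    {
        if (!*block)        /// an empty block means the query has finished
            break;
        process(*block);    /// hypothetical consumer of the data
    }
    else
    {
        waitForFd(std::get<int>(res)); /// hypothetical: poll/epoll until ready
    }
}
executor.finish(&read_context);
```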
/// Receive all remaining packets and finish the query.
/// It should be called after read() has returned an empty block.
void finish();
void finish(std::unique_ptr<ReadContext> * read_context = nullptr);
/// Cancel query execution. Sends Cancel packet and ignore others.
/// This method may be called from separate thread.
void cancel();
void cancel(std::unique_ptr<ReadContext> * read_context = nullptr);
/// Get totals and extremes if any.
Block getTotals() { return std::move(totals); }
@ -153,13 +164,16 @@ private:
void sendExternalTables();
/// If it wasn't sent yet, send a request to cancel all connections to replicas
void tryCancel(const char * reason);
void tryCancel(const char * reason, std::unique_ptr<ReadContext> * read_context);
/// Returns true if query was sent
bool isQueryPending() const;
/// Returns true if exception was thrown
bool hasThrownException() const;
/// Process packet for read and return data block if possible.
std::optional<Block> processPacket(Packet packet);
};
}

View File

@ -0,0 +1,272 @@
#pragma once
#if defined(OS_LINUX)
#include <sys/epoll.h>
#include <Common/Fiber.h>
#include <Common/FiberStack.h>
#include <Common/TimerDescriptor.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_READ_FROM_SOCKET;
extern const int CANNOT_OPEN_FILE;
extern const int SOCKET_TIMEOUT;
}
class RemoteQueryExecutorReadContext
{
public:
using Self = RemoteQueryExecutorReadContext;
bool is_read_in_progress = false;
Packet packet;
std::exception_ptr exception;
FiberStack stack;
boost::context::fiber fiber;
/// This mutex is needed because the fiber could be destroyed by the cancel() method from another thread.
std::mutex fiber_lock;
Poco::Timespan receive_timeout;
MultiplexedConnections & connections;
Poco::Net::Socket * last_used_socket = nullptr;
/// Here we have three descriptors we are going to wait on:
/// * socket_fd is the connection's descriptor. It may change when reading from several replicas.
/// * timer is a timerfd descriptor used to manually check the socket timeout.
/// * pipe_fd is a pipe we use to cancel the query and interrupt socket polling from the executor.
/// We put these descriptors into our own epoll_fd, which is polled by the external executor.
TimerDescriptor timer{CLOCK_MONOTONIC, 0};
int socket_fd = -1;
int epoll_fd;
int pipe_fd[2];
explicit RemoteQueryExecutorReadContext(MultiplexedConnections & connections_) : connections(connections_)
{
epoll_fd = epoll_create(2);
if (-1 == epoll_fd)
throwFromErrno("Cannot create epoll descriptor", ErrorCodes::CANNOT_OPEN_FILE);
if (-1 == pipe2(pipe_fd, O_NONBLOCK))
throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_OPEN_FILE);
{
epoll_event socket_event;
socket_event.events = EPOLLIN | EPOLLPRI;
socket_event.data.fd = pipe_fd[0];
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &socket_event))
throwFromErrno("Cannot add pipe descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
}
{
epoll_event timer_event;
timer_event.events = EPOLLIN | EPOLLPRI;
timer_event.data.fd = timer.getDescriptor();
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer_event.data.fd, &timer_event))
throwFromErrno("Cannot add timer descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
}
auto routine = Routine{connections, *this};
fiber = boost::context::fiber(std::allocator_arg_t(), stack, std::move(routine));
}
void setSocket(Poco::Net::Socket & socket)
{
int fd = socket.impl()->sockfd();
if (fd == socket_fd)
return;
epoll_event socket_event;
socket_event.events = EPOLLIN | EPOLLPRI;
socket_event.data.fd = fd;
if (socket_fd != -1)
{
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_DEL, socket_fd, &socket_event))
throwFromErrno("Cannot remove socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
}
socket_fd = fd;
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, socket_fd, &socket_event))
throwFromErrno("Cannot add socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
receive_timeout = socket.impl()->getReceiveTimeout();
}
bool checkTimeout() const
{
try
{
return checkTimeoutImpl();
}
catch (DB::Exception & e)
{
if (last_used_socket)
e.addMessage(" while reading from socket ({})", last_used_socket->peerAddress().toString());
throw;
}
}
bool checkTimeoutImpl() const
{
epoll_event events[3];
events[0].data.fd = events[1].data.fd = events[2].data.fd = -1;
/// Waiting on epoll_fd will not block if it was polled externally.
int num_events = epoll_wait(epoll_fd, events, 3, 0);
if (num_events == -1)
throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET);
bool is_socket_ready = false;
bool is_pipe_alarmed = false;
bool has_timer_alarm = false;
for (int i = 0; i < num_events; ++i)
{
if (events[i].data.fd == socket_fd)
is_socket_ready = true;
if (events[i].data.fd == timer.getDescriptor())
has_timer_alarm = true;
if (events[i].data.fd == pipe_fd[0])
is_pipe_alarmed = true;
}
if (is_pipe_alarmed)
return false;
if (has_timer_alarm && !is_socket_ready)
{
/// Socket receive timeout. Drain the timer in case of an error, otherwise it may be hidden by the timeout exception.
timer.drain();
throw NetException("Timeout exceeded", ErrorCodes::SOCKET_TIMEOUT);
}
return true;
}
void setTimer() const
{
/// No packet received yet. Initialize the timeout for the next async read.
timer.reset();
if (receive_timeout.totalMicroseconds())
timer.setRelative(receive_timeout);
}
bool resumeRoutine()
{
if (is_read_in_progress && !checkTimeout())
return false;
{
std::lock_guard guard(fiber_lock);
if (!fiber)
return false;
fiber = std::move(fiber).resume();
}
if (exception)
std::rethrow_exception(std::move(exception));
return true;
}
void cancel()
{
std::lock_guard guard(fiber_lock);
/// It is safe to just destroy the fiber: we are not in the process of reading from the socket.
boost::context::fiber to_destroy = std::move(fiber);
/// Write something to the pipe to wake up the waiting executor.
uint64_t buf = 0;
while (-1 == write(pipe_fd[1], &buf, sizeof(buf)))
{
if (errno == EAGAIN)
break;
if (errno != EINTR)
throwFromErrno("Cannot write to pipe", ErrorCodes::CANNOT_READ_FROM_SOCKET);
}
}
~RemoteQueryExecutorReadContext()
{
/// socket_fd is closed by Poco::Net::Socket
/// timer_fd is closed by TimerDescriptor
close(epoll_fd);
}
struct Routine
{
MultiplexedConnections & connections;
Self & read_context;
struct ReadCallback
{
Self & read_context;
Fiber & fiber;
void operator()(Poco::Net::Socket & socket)
{
try
{
read_context.setSocket(socket);
}
catch (DB::Exception & e)
{
e.addMessage(" while reading from socket ({})", socket.peerAddress().toString());
throw;
}
read_context.is_read_in_progress = true;
fiber = std::move(fiber).resume();
read_context.is_read_in_progress = false;
}
};
Fiber operator()(Fiber && sink) const
{
try
{
while (true)
{
read_context.packet = connections.receivePacketUnlocked(ReadCallback{read_context, sink});
sink = std::move(sink).resume();
}
}
catch (const boost::context::detail::forced_unwind &)
{
/// This exception is thrown by the fiber implementation when the fiber is being deleted but hasn't exited.
/// It should not be caught, or it will segfault.
/// All other exceptions must be caught.
throw;
}
catch (...)
{
read_context.exception = std::current_exception();
}
return std::move(sink);
}
};
};
}
#else
namespace DB
{
class RemoteQueryExecutorReadContext
{
public:
void cancel() {}
};
}
#endif
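
A minimal, self-contained Linux sketch (editor's illustration, not part of this changeset) of the wakeup mechanism the read context is built on: a non-blocking pipe registered in an epoll set, where a single write from another thread unblocks epoll_wait(). This is exactly how cancel() above interrupts the polling executor.

#include <sys/epoll.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <thread>

int main()
{
    int pipe_fd[2];
    if (-1 == pipe2(pipe_fd, O_NONBLOCK))
        return 1;

    int epoll_fd = epoll_create(1);
    epoll_event event{};
    event.events = EPOLLIN;
    event.data.fd = pipe_fd[0];
    epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &event);

    /// Another thread plays the role of cancel(): one write wakes the poller.
    std::thread canceller([&]
    {
        uint64_t buf = 0;
        (void)!write(pipe_fd[1], &buf, sizeof(buf));
    });

    epoll_event ready{};
    int num_events = epoll_wait(epoll_fd, &ready, 1, -1);  /// unblocks on the write
    printf("woken up, %d event(s)\n", num_events);

    canceller.join();
    close(pipe_fd[0]);
    close(pipe_fd[1]);
    close(epoll_fd);
}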

View File

@ -0,0 +1,166 @@
#pragma once
#include <Common/PODArray.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/UTF8Helpers.h>
#include <algorithm>
#include <climits>
#include <cstring>
#include <memory>
#include <utility>
#ifdef __SSE4_2__
# include <nmmintrin.h>
#endif
namespace DB
{
// used by FunctionsStringSimilarity and FunctionsStringHash
// includes extraction of ASCII ngrams, UTF8 ngrams, ASCII words and UTF8 words
template <size_t N, bool CaseInsensitive>
struct ExtractStringImpl
{
/// Padding from ColumnString. It is the number of bytes we can always read starting from pos if pos < end.
static constexpr size_t default_padding = 16;
/// Each read advances `pos` by `default_padding - (N - 1)` bytes; a window of size N is used.
/// A read first copies the `N - 1` last bytes of the buffer to its beginning, and then reads new bytes.
static constexpr size_t buffer_size = default_padding + N - 1;
// code_points must have length buffer_size
// pos: the current position in the string from which we copy data
// end: the end of the string
static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end)
{
/// Offset before which we copy some data.
constexpr size_t padding_offset = default_padding - N + 1;
/// We have an array like this for ASCII (N == 4, other cases are similar)
/// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
/// And we copy ^^^^^^^^^^^^^^^ these bytes to the start
/// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction
memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8));
/// Now we have an array
/// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
/// Doing unaligned read of 16 bytes and copy them like above
/// 16 is also chosen to do two `movups`.
/// Such copying allows us to keep 3 code points from the previous read, so 4-grams spanning two reads are produced as well.
memcpy(code_points + (N - 1), pos, default_padding * sizeof(UInt8));
if constexpr (CaseInsensitive)
{
/// We really need template lambdas with C++20 to do it inline
unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>());
}
pos += padding_offset;
if (pos > end)
return default_padding - (pos - end);
return default_padding;
}
// read an ASCII word
static ALWAYS_INLINE inline size_t readOneASCIIWord(PaddedPODArray<UInt8> & word_buf, const char *& pos, const char * end)
{
// skip separators
while (pos < end && !isAlphaNumericASCII(*pos))
++pos;
// the word starts here
const char * word_start = pos;
while (pos < end && isAlphaNumericASCII(*pos))
++pos;
word_buf.assign(word_start, pos);
if (CaseInsensitive)
{
std::transform(word_buf.begin(), word_buf.end(), word_buf.begin(), [](UInt8 c) { return std::tolower(c); });
}
return word_buf.size();
}
static ALWAYS_INLINE inline size_t readUTF8CodePoints(UInt32 * code_points, const char *& pos, const char * end)
{
memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt32));
size_t num = N - 1;
while (num < default_padding && pos < end)
{
code_points[num++] = readOneUTF8Code(pos, end);
}
return num;
}
// read one UTF8 word starting at pos into word_buf
static ALWAYS_INLINE inline size_t readOneUTF8Word(PaddedPODArray<UInt32> & word_buf, const char *& pos, const char * end)
{
// skip UTF8 separators
while (pos < end && isUTF8Sep(*pos))
++pos;
word_buf.clear();
// collect the word's UTF8 code points
while (pos < end && !isUTF8Sep(*pos))
{
word_buf.push_back(readOneUTF8Code(pos, end));
}
return word_buf.size();
}
private:
template <size_t Offset, typename Container, size_t... I>
static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &)
{
((cont[Offset + I] = std::tolower(cont[Offset + I])), ...);
}
// we use any ASCII non-alphanumeric character as a UTF8 separator
static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); }
// read one UTF8 character and return it
static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end)
{
size_t length = UTF8::seqLength(*pos);
if (pos + length > end)
length = end - pos;
UInt32 res;
switch (length)
{
case 1:
res = 0;
memcpy(&res, pos, 1);
break;
case 2:
res = 0;
memcpy(&res, pos, 2);
break;
case 3:
res = 0;
memcpy(&res, pos, 3);
break;
default:
memcpy(&res, pos, 4);
}
if constexpr (CaseInsensitive)
{
switch (length)
{
case 4:
res &= ~(1u << (5 + 3 * CHAR_BIT));
[[fallthrough]];
case 3:
res &= ~(1u << (5 + 2 * CHAR_BIT));
[[fallthrough]];
case 2:
res &= ~(1u);
res &= ~(1u << (5 + CHAR_BIT));
[[fallthrough]];
default:
res &= ~(1u << 5);
}
}
pos += length;
return res;
}
};
}
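
To make the windowed reads above concrete, here is a compilable toy version (editor's sketch, not part of this changeset) of the ASCII path for N = 4: each call carries over the last N - 1 window bytes and refills the rest from the string, so 4-grams that span two reads stay contiguous in the buffer; the appended zero bytes emulate the padding that ColumnString guarantees.

#include <cstdio>
#include <cstring>
#include <string>

int main()
{
    constexpr size_t N = 4;
    constexpr size_t default_padding = 16;
    unsigned char buf[default_padding + N - 1] = {};

    std::string s = "the quick brown fox";
    s.append(default_padding, '\0');  /// emulate ColumnString's padding
    const char * pos = s.data();
    const char * end = s.data() + s.size() - default_padding;

    auto read_window = [&]() -> size_t
    {
        constexpr size_t step = default_padding - N + 1;
        memmove(buf, buf + step, N - 1);            /// carry over the last N - 1 bytes
        memcpy(buf + N - 1, pos, default_padding);  /// padding makes this read safe
        pos += step;
        return pos > end ? default_padding - (pos - end) : default_padding;
    };

    size_t found = read_window();
    size_t iter = N - 1;  /// skip the garbage carried over before the first read
    do
    {
        for (; iter + N <= found; ++iter)
            printf("'%.4s'\n", reinterpret_cast<const char *>(buf) + iter);  /// one 4-gram per position
        iter = 0;
    } while (pos < end && (found = read_window()));
}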

View File

@ -0,0 +1,626 @@
#include <Functions/FunctionsStringHash.h>
#include <Functions/ExtractString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsHashing.h>
#include <Common/HashTable/ClearableHashMap.h>
#include <Common/HashTable/Hash.h>
#include <Common/PODArray.h>
#include <Core/Defines.h>
#include <bitset>
#include <functional>
#include <memory>
#include <tuple>
#include <vector>
#include <common/unaligned.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
struct Hash
{
static UInt64 crc32u64(UInt64 crc [[maybe_unused]], UInt64 val [[maybe_unused]])
{
#ifdef __SSE4_2__
return _mm_crc32_u64(crc, val);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cd(crc, val);
#else
throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED);
#endif
}
static UInt64 crc32u32(UInt64 crc [[maybe_unused]], UInt32 val [[maybe_unused]])
{
#ifdef __SSE4_2__
return _mm_crc32_u32(crc, val);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cw(crc, val);
#else
throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED);
#endif
}
static UInt64 crc32u8(UInt64 crc [[maybe_unused]], UInt8 val [[maybe_unused]])
{
#ifdef __SSE4_2__
return _mm_crc32_u8(crc, val);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cb(crc, val);
#else
throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED);
#endif
}
static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points)
{
return crc32u64(-1ULL, unalignedLoad<UInt32>(code_points));
}
static ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points)
{
UInt64 crc = -1ULL;
crc = crc32u64(crc, code_points[0]);
crc = crc32u64(crc, code_points[1]);
crc = crc32u64(crc, code_points[2]);
return crc;
}
static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset)
{
UInt64 crc1 = -1ULL;
UInt64 crc2 = -1ULL;
for (size_t i = offset; i < size; i += 2)
crc1 = crc32u64(crc1, hashes[i]);
for (size_t i = offset + 1; i < size; i += 2)
crc2 = crc32u64(crc2, hashes[i]);
if ((size - offset) & 1)
{
for (size_t i = 0; i < offset; i += 2)
crc2 = crc32u64(crc2, hashes[i]);
for (size_t i = 1; i < offset; i += 2)
crc1 = crc32u64(crc1, hashes[i]);
}
else
{
for (size_t i = 0; i < offset; i += 2)
crc1 = crc32u64(crc1, hashes[i]);
for (size_t i = 1; i < offset; i += 2)
crc2 = crc32u64(crc2, hashes[i]);
}
return crc1 | (crc2 << 32u);
}
static ALWAYS_INLINE inline UInt64 hashSum(const UInt8 * hashes [[maybe_unused]], size_t K [[maybe_unused]])
{
UInt64 crc1 = -1ULL;
UInt64 crc2 = -1ULL;
for (size_t i = 0; i < K; i += 2)
crc1 = crc32u8(crc1, hashes[i]);
for (size_t i = 1; i < K; i += 2)
crc2 = crc32u8(crc2, hashes[i]);
return crc1 | (crc2 << 32u);
}
static ALWAYS_INLINE inline UInt64 hashSum(const UInt32 * hashes [[maybe_unused]], size_t K [[maybe_unused]])
{
UInt64 crc1 = -1ULL;
UInt64 crc2 = -1ULL;
for (size_t i = 0; i < K; i += 2)
crc1 = crc32u32(crc1, hashes[i]);
for (size_t i = 1; i < K; i += 2)
crc2 = crc32u32(crc2, hashes[i]);
return crc1 | (crc2 << 32u);
}
static ALWAYS_INLINE inline UInt64 hashSum(const UInt64 * hashes, size_t K)
{
UInt64 crc1 = -1ULL;
UInt64 crc2 = -1ULL;
for (size_t i = 0; i < K; i += 2)
crc1 = crc32u64(crc1, hashes[i]);
for (size_t i = 1; i < K; i += 2)
crc2 = crc32u64(crc2, hashes[i]);
return crc1 | (crc2 << 32u);
}
};
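/// Editor's note (not part of the diff): hashSum and wordShinglesHash above
/// combine their inputs in two interleaved CRC accumulators, one over even
/// indices and one over odd indices, then pack them as `crc1 | (crc2 << 32u)`.
/// This relies on the CRC32 intrinsics returning 32-bit values, so the two
/// lanes do not overlap. For example, with hashes = {h0, h1, h2, h3}:
///     crc1 = crc32(crc32(-1, h0), h2)   /// even slots
///     crc2 = crc32(crc32(-1, h1), h3)   /// odd slots
///     result = crc1 | (crc2 << 32)
/// In wordShinglesHash the wrapped prefix [0, offset) swaps lanes when
/// (size - offset) is odd, so the even/odd alternation continues across the
/// ring-buffer wrap.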
// Simhash String -> UInt64
// N: the length of ngrams or word shingles
// CodePoint: UInt8 (ASCII) or UInt32 (UTF8)
// UTF8: whether the input is treated as UTF8; (CodePoint, UTF8) can only be (UInt8, false) or (UInt32, true)
// Ngram: ngrams (true) or word shingles (false)
// CaseInsensitive: whether letter case is ignored
template <size_t N, typename CodePoint, bool UTF8, bool Ngram, bool CaseInsensitive>
struct SimhashImpl
{
using StrOp = ExtractStringImpl<N, CaseInsensitive>;
// we assume the size of one word cannot exceed 128, which may not be true
// if a word's size exceeds 128, it is cut into several words
static constexpr size_t max_string_size = 1u << 15;
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
// Simhash ngram calculation function: String -> UInt64
// this function extracts ngrams from the input string and maintains a 64-dimension vector
// for each ngram it calculates a 64 bit hash value and updates the vector according to that hash
// finally it returns a 64 bit value (UInt64) whose i-th bit is 1 if vector[i] > 0 and 0 otherwise
static ALWAYS_INLINE inline UInt64 ngramCalculateHashValue(
const char * data,
size_t size,
size_t (*read_code_points)(CodePoint *, const char *&, const char *),
UInt64 (*hash_functor)(const CodePoint *))
{
const char * start = data;
const char * end = data + size;
// fingerprint vector, all dimensions initialized to zero at first
Int64 finger_vec[64] = {};
CodePoint cp[simultaneously_codepoints_num] = {};
size_t found = read_code_points(cp, start, end);
size_t iter = N - 1;
do
{
for (; iter + N <= found; ++iter)
{
// for each ngram we calculate a 64 bit hash,
// then update finger_vec according to this hash value:
// if the i-th bit is 1, finger_vec[i] is incremented, otherwise decremented
UInt64 hash_value = hash_functor(cp + iter);
std::bitset<64> bits(hash_value);
for (size_t i = 0; i < 64; ++i)
{
finger_vec[i] += ((bits.test(i)) ? 1 : -1);
}
}
iter = 0;
} while (start < end && (found = read_code_points(cp, start, end)));
// finally, we return a 64 bit value built from finger_vec:
// if finger_vec[i] > 0, the i-th bit of the value is 1, otherwise 0
std::bitset<64> res_bit(0u);
for (size_t i = 0; i < 64; ++i)
{
if (finger_vec[i] > 0)
res_bit.set(i);
}
return res_bit.to_ullong();
}
// Simhash word shingle calculation function: String -> UInt64
// this function extracts N-word shingles from the input string and likewise maintains a 64-dimension vector
// for each word shingle it calculates a 64 bit hash value and updates the vector according to that hash
// finally it returns a 64 bit value (UInt64) whose i-th bit is 1 if vector[i] > 0 and 0 otherwise
//
// word shingle hash values are calculated as follows:
// 1. first, extract N words and calculate N hash values, store them into an array, and use these N hash values
// to calculate the first word shingle hash value
// 2. then extract one word at a time, calculate the hash value of the new word, and use the latest N hash
// values to calculate the next word shingle hash value
static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue(
const char * data,
size_t size,
size_t (*read_one_word)(PaddedPODArray<CodePoint> &, const char *&, const char *),
UInt64 (*hash_functor)(const UInt64 *, size_t, size_t))
{
const char * start = data;
const char * end = data + size;
// Likewise, a 64-dimension fingerprint vector initialized to zero
Int64 finger_vec[64] = {};
// an array to store N word hash values
UInt64 nword_hashes[N] = {};
// word buffer to store one word
PaddedPODArray<CodePoint> word_buf;
// get first word shingle
for (size_t i = 0; i < N && start < end; ++i)
{
read_one_word(word_buf, start, end);
if (!word_buf.empty())
{
// for each word, calculate a hash value and store it into the array
nword_hashes[i++] = Hash::hashSum(word_buf.data(), word_buf.size());
}
}
// calculate the first word shingle hash value
UInt64 hash_value = hash_functor(nword_hashes, N, 0);
std::bitset<64> first_bits(hash_value);
for (size_t i = 0; i < 64; ++i)
{
finger_vec[i] += ((first_bits.test(i)) ? 1 : -1);
}
size_t offset = 0;
while (start < end && read_one_word(word_buf, start, end))
{
// we need to store the new word hash value in the oldest location.
// For example, with N = 5 and array |a0|a1|a2|a3|a4|, a0 is the oldest location,
// so we store the new word hash into a0's slot and the array becomes
// |a5|a1|a2|a3|a4|. Next time a1 is the oldest location; storing the new
// word hash there makes the array |a5|a6|a2|a3|a4|
nword_hashes[offset] = Hash::hashSum(word_buf.data(), word_buf.size());
offset = (offset + 1) % N;
// because of this storage scheme, in order not to lose the word shingles'
// sequence information, when calculating a word shingle hash value we must provide the
// offset of the first word's hash value within the shingle
hash_value = hash_functor(nword_hashes, N, offset);
std::bitset<64> bits(hash_value);
for (size_t i = 0; i < 64; ++i)
{
finger_vec[i] += ((bits.test(i)) ? 1 : -1);
}
}
std::bitset<64> res_bit(0u);
for (size_t i = 0; i < 64; ++i)
{
if (finger_vec[i] > 0)
res_bit.set(i);
}
return res_bit.to_ullong();
}
static void apply(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt64> & res)
{
for (size_t i = 0; i < offsets.size(); ++i)
{
const char * one_data = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
const size_t data_size = offsets[i] - offsets[i - 1] - 1;
if (data_size <= max_string_size)
{
if constexpr (Ngram)
{
if constexpr (!UTF8)
res[i] = ngramCalculateHashValue(one_data, data_size, StrOp::readASCIICodePoints, Hash::ngramASCIIHash);
else
res[i] = ngramCalculateHashValue(one_data, data_size, StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash);
}
else
{
if constexpr (!UTF8)
res[i] = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneASCIIWord, Hash::wordShinglesHash);
else
res[i] = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneUTF8Word, Hash::wordShinglesHash);
}
}
else
res[i] = -1ull;
}
}
};
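/// Editor's sketch (not part of the diff): what the fingerprint buys us.
/// Strings that share most of their ngrams produce fingerprints that differ
/// in only a few bits, so similarity can be checked with the
/// bitHammingDistance function added in this same changeset, e.g. with toy
/// values:
///     UInt64 h1 = 0x0123456789abcdef;  /// Simhash of document 1
///     UInt64 h2 = 0x0123456789abcdee;  /// near-duplicate: differs in 1 bit
///     __builtin_popcountll(h1 ^ h2);   /// == 1, small distance => similar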
template <typename F, size_t K, size_t v>
class FixedHeap
{
public:
FixedHeap() = delete;
explicit FixedHeap(F f_) : f(f_), data_t(std::make_shared<std::vector<UInt64>>(K, v))
{
std::make_heap(data_t->begin(), data_t->end(), f);
}
void insertAndReplace(UInt64 new_v)
{
data_t->push_back(new_v);
std::push_heap(data_t->begin(), data_t->end(), f);
std::pop_heap(data_t->begin(), data_t->end(), f);
data_t->pop_back();
}
const UInt64 * data() { return data_t->data(); }
private:
F f;
std::shared_ptr<std::vector<UInt64>> data_t;
};
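/// Editor's note (not part of the diff): FixedHeap keeps a running set of K
/// extreme values. Seeded with K copies of the sentinel -1ULL and std::less,
/// the heap top is the largest kept value; insertAndReplace() pushes the
/// candidate and pops the current maximum, so the heap always holds the K
/// smallest values seen so far. Note the aliases below: MaxHeap (std::less)
/// keeps the K minimum values and MinHeap (std::greater) keeps the K maximum.
///     FixedHeap<std::less<size_t>, 3, -1ULL> k_min{std::less<size_t>{}};
///     for (UInt64 v : {42, 7, 100, 3, 55})
///         k_min.insertAndReplace(v);
///     /// k_min.data() now holds {3, 7, 42} in heap order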
// Minhash: String -> Tuple(UInt64, UInt64)
// for each string, we extract ngrams or word shingles and calculate a hash value for each of them,
// then we take the K minimum hash values to calculate one hash sum,
// and the K maximum hash values to calculate another,
// and return the two hash sums: Tuple(hashsum1, hashsum2)
//
// N: the length of ngrams or word shingles
// K: the number of minimum and maximum hashes that we keep
// CodePoint: UInt8 (ASCII) or UInt32 (UTF8)
// UTF8: whether the input is treated as UTF8; (CodePoint, UTF8) can only be (UInt8, false) or (UInt32, true)
// Ngram: ngrams (true) or word shingles (false)
// CaseInsensitive: whether letter case is ignored
template <size_t N, size_t K, typename CodePoint, bool UTF8, bool Ngram, bool CaseInsensitive>
struct MinhashImpl
{
using Less = std::less<size_t>;
using Greater = std::greater<size_t>;
using MaxHeap = FixedHeap<std::less<size_t>, K, -1ULL>;
using MinHeap = FixedHeap<std::greater<size_t>, K, 0>;
using StrOp = ExtractStringImpl<N, CaseInsensitive>;
static constexpr size_t max_string_size = 1u << 15;
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
// Minhash ngram calculation function: String -> Tuple(UInt64, UInt64)
// we extract ngrams from the input string and calculate a hash value for each ngram,
// then take the K minimum hash values to calculate one hash sum
// and the K maximum hash values to calculate another,
// and return the two hash sums: Tuple(hashsum1, hashsum2)
static ALWAYS_INLINE inline std::tuple<UInt64, UInt64> ngramCalculateHashValue(
const char * data,
size_t size,
size_t (*read_code_points)(CodePoint *, const char *&, const char *),
UInt64 (*hash_functor)(const CodePoint *))
{
const char * start = data;
const char * end = data + size;
// we only maintain the K minimum and K maximum hash values
MaxHeap k_minimum_hashes(Less{});
MinHeap k_maximum_hashes(Greater{});
CodePoint cp[simultaneously_codepoints_num] = {};
size_t found = read_code_points(cp, start, end);
size_t iter = N - 1;
do
{
for (; iter + N <= found; ++iter)
{
auto new_hash = hash_functor(cp + iter);
// insert the new hash value into the arrays used to store the K minimum
// and K maximum values
k_minimum_hashes.insertAndReplace(new_hash);
k_maximum_hashes.insertAndReplace(new_hash);
}
iter = 0;
} while (start < end && (found = read_code_points(cp, start, end)));
// calculate hashsum of the K minimum hash values and K maximum hash values
UInt64 res1 = Hash::hashSum(k_minimum_hashes.data(), K);
UInt64 res2 = Hash::hashSum(k_maximum_hashes.data(), K);
return std::make_tuple(res1, res2);
}
// Minhash word shingle hash calculation function: String -> Tuple(UInt64, UInt64)
// for each word shingle we calculate a hash value, but in fact we only maintain the
// K minimum and K maximum hash values
static ALWAYS_INLINE inline std::tuple<UInt64, UInt64> wordShinglesCalculateHashValue(
const char * data,
size_t size,
size_t (*read_one_word)(PaddedPODArray<CodePoint> &, const char *&, const char *),
UInt64 (*hash_functor)(const UInt64 *, size_t, size_t))
{
const char * start = data;
const char * end = start + size;
// again, we only store the K minimum and K maximum hash values
MaxHeap k_minimum_hashes(Less{});
MinHeap k_maximum_hashes(Greater{});
// array to store N word hashes
UInt64 nword_hashes[N] = {};
// word buffer to store one word
PaddedPODArray<CodePoint> word_buf;
// word shingle hash calculation and word hash storage work the same way as
// described in the Simhash wordShinglesCalculateHashValue function
for (size_t i = 0; i < N && start < end; ++i)
{
read_one_word(word_buf, start, end);
if (!word_buf.empty())
{
nword_hashes[i++] = Hash::hashSum(word_buf.data(), word_buf.size());
}
}
auto new_hash = hash_functor(nword_hashes, N, 0);
k_minimum_hashes.insertAndReplace(new_hash);
k_maximum_hashes.insertAndReplace(new_hash);
size_t offset = 0;
while (start < end && read_one_word(word_buf, start, end))
{
nword_hashes[offset] = Hash::hashSum(word_buf.data(), word_buf.size());
offset = (offset + 1) % N;
new_hash = hash_functor(nword_hashes, N, offset);
k_minimum_hashes.insertAndReplace(new_hash);
k_maximum_hashes.insertAndReplace(new_hash);
}
// calculate hashsum
UInt64 res1 = Hash::hashSum(k_minimum_hashes.data(), K);
UInt64 res2 = Hash::hashSum(k_maximum_hashes.data(), K);
return std::make_tuple(res1, res2);
}
static void apply(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
PaddedPODArray<UInt64> & res1,
PaddedPODArray<UInt64> & res2)
{
for (size_t i = 0; i < offsets.size(); ++i)
{
const char * one_data = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
const size_t data_size = offsets[i] - offsets[i - 1] - 1;
if (data_size <= max_string_size)
{
if constexpr (Ngram)
{
if constexpr (!UTF8)
std::tie(res1[i], res2[i]) = ngramCalculateHashValue(one_data, data_size, StrOp::readASCIICodePoints, Hash::ngramASCIIHash);
else
std::tie(res1[i], res2[i]) = ngramCalculateHashValue(one_data, data_size, StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash);
}
else
{
if constexpr (!UTF8)
std::tie(res1[i], res2[i]) = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneASCIIWord, Hash::wordShinglesHash);
else
std::tie(res1[i], res2[i]) = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneUTF8Word, Hash::wordShinglesHash);
}
}
else
std::tie(res1[i], res2[i]) = std::make_tuple(-1ull, -1ull);
}
}
};
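/// Editor's note (not part of the diff): the classic MinHash property this
/// implementation relies on is that, for a random hash function h,
///     P[min h(A) == min h(B)] = |A ∩ B| / |A ∪ B|
/// i.e. the probability of matching minima equals the Jaccard similarity of
/// the ngram (or shingle) sets A and B. Keeping the K smallest and K largest
/// hashes and summing each side compresses a string into a
/// Tuple(UInt64, UInt64), so strings whose ngram sets mostly overlap will
/// usually produce identical tuples, making near-duplicate search a cheap
/// tuple comparison.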
struct NameNgramSimhash
{
static constexpr auto name = "ngramSimhash";
};
struct NameNgramSimhashCaseInsensitive
{
static constexpr auto name = "ngramSimhashCaseInsensitive";
};
struct NameNgramSimhashUTF8
{
static constexpr auto name = "ngramSimhashUTF8";
};
struct NameNgramSimhashCaseInsensitiveUTF8
{
static constexpr auto name = "ngramSimhashCaseInsensitiveUTF8";
};
struct NameWordShingleSimhash
{
static constexpr auto name = "wordShingleSimhash";
};
struct NameWordShingleSimhashCaseInsensitive
{
static constexpr auto name = "wordShingleSimhashCaseInsensitive";
};
struct NameWordShingleSimhashUTF8
{
static constexpr auto name = "wordShingleSimhashUTF8";
};
struct NameWordShingleSimhashCaseInsensitiveUTF8
{
static constexpr auto name = "wordShingleSimhashCaseInsensitiveUTF8";
};
struct NameNgramMinhash
{
static constexpr auto name = "ngramMinhash";
};
struct NameNgramMinhashCaseInsensitive
{
static constexpr auto name = "ngramMinhashCaseInsensitive";
};
struct NameNgramMinhashUTF8
{
static constexpr auto name = "ngramMinhashUTF8";
};
struct NameNgramMinhashCaseInsensitiveUTF8
{
static constexpr auto name = "ngramMinhashCaseInsensitiveUTF8";
};
struct NameWordShingleMinhash
{
static constexpr auto name = "wordShingleMinhash";
};
struct NameWordShingleMinhashCaseInsensitive
{
static constexpr auto name = "wordShingleMinhashCaseInsensitive";
};
struct NameWordShingleMinhashUTF8
{
static constexpr auto name = "wordShingleMinhashUTF8";
};
struct NameWordShingleMinhashCaseInsensitiveUTF8
{
static constexpr auto name = "wordShingleMinhashCaseInsensitiveUTF8";
};
// Simhash
using FunctionNgramSimhash = FunctionsStringHash<SimhashImpl<4, UInt8, false, true, false>, NameNgramSimhash, true>;
using FunctionNgramSimhashCaseInsensitive
= FunctionsStringHash<SimhashImpl<4, UInt8, false, true, true>, NameNgramSimhashCaseInsensitive, true>;
using FunctionNgramSimhashUTF8 = FunctionsStringHash<SimhashImpl<3, UInt32, true, true, false>, NameNgramSimhashUTF8, true>;
using FunctionNgramSimhashCaseInsensitiveUTF8
= FunctionsStringHash<SimhashImpl<3, UInt32, true, true, true>, NameNgramSimhashCaseInsensitiveUTF8, true>;
using FunctionWordShingleSimhash = FunctionsStringHash<SimhashImpl<3, UInt8, false, false, false>, NameWordShingleSimhash, true>;
using FunctionWordShingleSimhashCaseInsensitive
= FunctionsStringHash<SimhashImpl<3, UInt8, false, false, true>, NameWordShingleSimhashCaseInsensitive, true>;
using FunctionWordShingleSimhashUTF8 = FunctionsStringHash<SimhashImpl<3, UInt32, true, false, false>, NameWordShingleSimhashUTF8, true>;
using FunctionWordShingleSimhashCaseInsensitiveUTF8
= FunctionsStringHash<SimhashImpl<3, UInt32, true, false, true>, NameWordShingleSimhashCaseInsensitiveUTF8, true>;
// Minhash
using FunctionNgramMinhash = FunctionsStringHash<MinhashImpl<4, 6, UInt8, false, true, false>, NameNgramMinhash, false>;
using FunctionNgramMinhashCaseInsensitive
= FunctionsStringHash<MinhashImpl<4, 6, UInt8, false, true, true>, NameNgramMinhashCaseInsensitive, false>;
using FunctionNgramMinhashUTF8 = FunctionsStringHash<MinhashImpl<4, 6, UInt32, true, true, false>, NameNgramMinhashUTF8, false>;
using FunctionNgramMinhashCaseInsensitiveUTF8
= FunctionsStringHash<MinhashImpl<4, 6, UInt32, true, true, true>, NameNgramMinhashCaseInsensitiveUTF8, false>;
using FunctionWordShingleMinhash = FunctionsStringHash<MinhashImpl<3, 6, UInt8, false, false, false>, NameWordShingleMinhash, false>;
using FunctionWordShingleMinhashCaseInsensitive
= FunctionsStringHash<MinhashImpl<3, 6, UInt8, false, false, true>, NameWordShingleMinhashCaseInsensitive, false>;
using FunctionWordShingleMinhashUTF8
= FunctionsStringHash<MinhashImpl<3, 6, UInt32, true, false, false>, NameWordShingleMinhashUTF8, false>;
using FunctionWordShingleMinhashCaseInsensitiveUTF8
= FunctionsStringHash<MinhashImpl<3, 6, UInt32, true, false, true>, NameWordShingleMinhashCaseInsensitiveUTF8, false>;
void registerFunctionsStringHash(FunctionFactory & factory)
{
factory.registerFunction<FunctionNgramSimhash>();
factory.registerFunction<FunctionNgramSimhashCaseInsensitive>();
factory.registerFunction<FunctionNgramSimhashUTF8>();
factory.registerFunction<FunctionNgramSimhashCaseInsensitiveUTF8>();
factory.registerFunction<FunctionWordShingleSimhash>();
factory.registerFunction<FunctionWordShingleSimhashCaseInsensitive>();
factory.registerFunction<FunctionWordShingleSimhashUTF8>();
factory.registerFunction<FunctionWordShingleSimhashCaseInsensitiveUTF8>();
factory.registerFunction<FunctionNgramMinhash>();
factory.registerFunction<FunctionNgramMinhashCaseInsensitive>();
factory.registerFunction<FunctionNgramMinhashUTF8>();
factory.registerFunction<FunctionNgramMinhashCaseInsensitiveUTF8>();
factory.registerFunction<FunctionWordShingleMinhash>();
factory.registerFunction<FunctionWordShingleMinhashCaseInsensitive>();
factory.registerFunction<FunctionWordShingleMinhashUTF8>();
factory.registerFunction<FunctionWordShingleMinhashCaseInsensitiveUTF8>();
}
}
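
Once registered, the sixteen functions above become callable from SQL. As an illustrative, unverified example combining two functions from this changeset: `SELECT bitHammingDistance(ngramSimhash(s1), ngramSimhash(s2))`, where a small result suggests near-duplicate strings.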

View File

@ -0,0 +1,83 @@
#pragma once
#include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunctionImpl.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
// FunctionStringHash
// Simhash: String -> UInt64
// Minhash: String -> (UInt64, UInt64)
template <typename Impl, typename Name, bool is_simhash>
class FunctionsStringHash : public IFunction
{
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionsStringHash>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 1; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Function {} expect single String argument, got {}", getName(), arguments[0]->getName());
auto type = std::make_shared<DataTypeUInt64>();
if constexpr (is_simhash)
return type;
return std::make_shared<DataTypeTuple>(DataTypes{type, type});
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
{
const ColumnPtr & column = arguments[0].column;
if constexpr (is_simhash)
{
// non-const string; the const case is handled by useDefaultImplementationForConstants.
auto col_res = ColumnVector<UInt64>::create();
auto & vec_res = col_res->getData();
vec_res.resize(column->size());
const ColumnString * col_str_vector = checkAndGetColumn<ColumnString>(&*column);
Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_res);
return col_res;
}
else // Min hash
{
// non-const string
auto col_h1 = ColumnVector<UInt64>::create();
auto col_h2 = ColumnVector<UInt64>::create();
auto & vec_h1 = col_h1->getData();
auto & vec_h2 = col_h2->getData();
vec_h1.resize(column->size());
vec_h2.resize(column->size());
const ColumnString * col_str_vector = checkAndGetColumn<ColumnString>(&*column);
Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_h1, vec_h2);
MutableColumns tuple_columns;
tuple_columns.emplace_back(std::move(col_h1));
tuple_columns.emplace_back(std::move(col_h2));
return ColumnTuple::create(std::move(tuple_columns));
}
}
};
}

View File

@ -0,0 +1,160 @@
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Functions/castTypeToEither.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
template <typename A, typename B>
struct BitHammingDistanceImpl
{
using ResultType = UInt8;
static void NO_INLINE vectorVector(const PaddedPODArray<A> & a, const PaddedPODArray<B> & b, PaddedPODArray<ResultType> & c)
{
size_t size = a.size();
for (size_t i = 0; i < size; ++i)
c[i] = apply(a[i], b[i]);
}
static void NO_INLINE vectorConstant(const PaddedPODArray<A> & a, B b, PaddedPODArray<ResultType> & c)
{
size_t size = a.size();
for (size_t i = 0; i < size; ++i)
c[i] = apply(a[i], b);
}
static void NO_INLINE constantVector(A a, const PaddedPODArray<B> & b, PaddedPODArray<ResultType> & c)
{
size_t size = b.size();
for (size_t i = 0; i < size; ++i)
c[i] = apply(a, b[i]);
}
private:
static inline UInt8 apply(UInt64 a, UInt64 b)
{
UInt64 res = a ^ b;
return __builtin_popcountll(res);
}
};
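/// Editor's worked example (not part of the diff): apply(5, 3) computes
/// 5 ^ 3 = 0b110, whose popcount is 2, i.e. the integers differ in exactly
/// two bit positions.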
template <typename F>
bool castType(const IDataType * type, F && f)
{
return castTypeToEither<
DataTypeInt8,
DataTypeInt16,
DataTypeInt32,
DataTypeInt64,
DataTypeUInt8,
DataTypeUInt16,
DataTypeUInt32,
DataTypeUInt64>(type, std::forward<F>(f));
}
template <typename F>
static bool castBothTypes(const IDataType * left, const IDataType * right, F && f)
{
return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); });
}
// bitHammingDistance function: (Integer, Integer) -> UInt8
class FunctionBitHammingDistance : public IFunction
{
public:
static constexpr auto name = "bitHammingDistance";
using ResultType = UInt8;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitHammingDistance>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isInteger(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isInteger(arguments[1]))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeUInt8>();
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
const auto * left_generic = arguments[0].type.get();
const auto * right_generic = arguments[1].type.get();
ColumnPtr result_column;
bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right)
{
using LeftDataType = std::decay_t<decltype(left)>;
using RightDataType = std::decay_t<decltype(right)>;
using T0 = typename LeftDataType::FieldType;
using T1 = typename RightDataType::FieldType;
using ColVecT0 = ColumnVector<T0>;
using ColVecT1 = ColumnVector<T1>;
using ColVecResult = ColumnVector<ResultType>;
using OpImpl = BitHammingDistanceImpl<T0, T1>;
const auto * const col_left_raw = arguments[0].column.get();
const auto * const col_right_raw = arguments[1].column.get();
typename ColVecResult::MutablePtr col_res = nullptr;
col_res = ColVecResult::create();
auto & vec_res = col_res->getData();
vec_res.resize(input_rows_count);
if (auto col_left_const = checkAndGetColumnConst<ColVecT0>(col_left_raw))
{
if (auto col_right = checkAndGetColumn<ColVecT1>(col_right_raw))
{
// constant integer - non-constant integer
OpImpl::constantVector(col_left_const->template getValue<T0>(), col_right->getData(), vec_res);
}
else
return false;
}
else if (auto col_left = checkAndGetColumn<ColVecT0>(col_left_raw))
{
if (auto col_right = checkAndGetColumn<ColVecT1>(col_right_raw))
// non-constant integer - non-constant integer
OpImpl::vectorVector(col_left->getData(), col_right->getData(), vec_res);
else if (auto col_right_const = checkAndGetColumnConst<ColVecT1>(col_right_raw))
// non-constant integer - constant integer
OpImpl::vectorConstant(col_left->getData(), col_right_const->template getValue<T1>(), vec_res);
else
return false;
}
else
return false;
result_column = std::move(col_res);
return true;
});
if (!valid)
throw Exception(getName() + "'s arguments do not match the expected data types", ErrorCodes::ILLEGAL_COLUMN);
return result_column;
}
};
void registerFunctionBitHammingDistance(FunctionFactory & factory)
{
factory.registerFunction<FunctionBitHammingDistance>();
}
}

View File

@ -42,7 +42,9 @@ void registerFunctionsNull(FunctionFactory &);
void registerFunctionsJSON(FunctionFactory &);
void registerFunctionsConsistentHashing(FunctionFactory & factory);
void registerFunctionsUnixTimestamp64(FunctionFactory & factory);
void registerFunctionBitHammingDistance(FunctionFactory & factory);
void registerFunctionTupleHammingDistance(FunctionFactory & factory);
void registerFunctionsStringHash(FunctionFactory & factory);
#if !defined(ARCADIA_BUILD)
void registerFunctionBayesAB(FunctionFactory &);
#endif
@ -57,7 +59,6 @@ void registerFunctionAESDecryptMysql(FunctionFactory & factory);
#endif
void registerFunctions()
{
auto & factory = FunctionFactory::instance();
@ -99,6 +100,9 @@ void registerFunctions()
registerFunctionsIntrospection(factory);
registerFunctionsConsistentHashing(factory);
registerFunctionsUnixTimestamp64(factory);
registerFunctionBitHammingDistance(factory);
registerFunctionTupleHammingDistance(factory);
registerFunctionsStringHash(factory);
#if !defined(ARCADIA_BUILD)
registerFunctionBayesAB(factory);

View File

@ -0,0 +1,220 @@
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Functions/castTypeToEither.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
template <typename A, typename B>
struct TupleHammingDistanceImpl
{
using ResultType = UInt8;
static void NO_INLINE vectorVector(
const PaddedPODArray<A> & a1,
const PaddedPODArray<A> & b1,
const PaddedPODArray<B> & a2,
const PaddedPODArray<B> & b2,
PaddedPODArray<ResultType> & c)
{
size_t size = a1.size();
for (size_t i = 0; i < size; ++i)
c[i] = apply(a1[i], a2[i]) + apply(b1[i], b2[i]);
}
static void NO_INLINE
vectorConstant(const PaddedPODArray<A> & a1, const PaddedPODArray<A> & b1, UInt64 a2, UInt64 b2, PaddedPODArray<ResultType> & c)
{
size_t size = a1.size();
for (size_t i = 0; i < size; ++i)
c[i] = apply(a1[i], a2) + apply(b1[i], b2);
}
static void NO_INLINE
constantVector(UInt64 a1, UInt64 b1, const PaddedPODArray<B> & a2, const PaddedPODArray<B> & b2, PaddedPODArray<ResultType> & c)
{
size_t size = a2.size();
for (size_t i = 0; i < size; ++i)
c[i] = apply(a1, a2[i]) + apply(b1, b2[i]);
}
static ResultType constantConstant(UInt64 a1, UInt64 b1, UInt64 a2, UInt64 b2) { return apply(a1, a2) + apply(b1, b2); }
private:
static inline UInt8 apply(UInt64 a, UInt64 b) { return a != b; }
};
template <typename F>
bool castType(const IDataType * type, F && f)
{
return castTypeToEither<
DataTypeInt8,
DataTypeInt16,
DataTypeInt32,
DataTypeInt64,
DataTypeUInt8,
DataTypeUInt16,
DataTypeUInt32,
DataTypeUInt64>(type, std::forward<F>(f));
}
template <typename F>
static bool castBothTypes(const IDataType * left, const IDataType * right, F && f)
{
return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); });
}
// tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer)) -> 0/1/2
// in order to avoid code bloat, for non-constant tuples we require that both elements
// of a tuple have the same data type; for constant tuples the elements can be of any integer
// data type, and we cast all of them to UInt64
class FunctionTupleHammingDistance : public IFunction
{
public:
static constexpr auto name = "tupleHammingDistance";
using ResultType = UInt8;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionTupleHammingDistance>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isTuple(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isTuple(arguments[1]))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeUInt8>();
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
const ColumnWithTypeAndName & arg1 = arguments[0];
const ColumnWithTypeAndName & arg2 = arguments[1];
const DataTypeTuple & type1 = static_cast<const DataTypeTuple &>(*arg1.type);
const DataTypeTuple & type2 = static_cast<const DataTypeTuple &>(*arg2.type);
const auto & left_elems = type1.getElements();
const auto & right_elems = type2.getElements();
if (left_elems.size() != 2 || right_elems.size() != 2)
throw Exception(
"Illegal column of arguments of function " + getName() + ", tuple should have exactly two elements.",
ErrorCodes::ILLEGAL_COLUMN);
ColumnPtr result_column;
bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right)
{
using LeftDataType = std::decay_t<decltype(left)>;
using RightDataType = std::decay_t<decltype(right)>;
using T0 = typename LeftDataType::FieldType;
using T1 = typename RightDataType::FieldType;
using ColVecT0 = ColumnVector<T0>;
using ColVecT1 = ColumnVector<T1>;
using ColVecResult = ColumnVector<ResultType>;
using OpImpl = TupleHammingDistanceImpl<T0, T1>;
// we cannot use useDefaultImplementationForConstants,
// because with it, tupleHammingDistance((10, 300), (10, 20)) would not work,
// since 10 has data type UInt8 while 300 has data type UInt16
if (const ColumnConst * const_col_left = checkAndGetColumnConst<ColumnTuple>(arg1.column.get()))
{
if (const ColumnConst * const_col_right = checkAndGetColumnConst<ColumnTuple>(arg2.column.get()))
{
auto cols1 = convertConstTupleToConstantElements(*const_col_left);
auto cols2 = convertConstTupleToConstantElements(*const_col_right);
Field a1, b1, a2, b2;
cols1[0]->get(0, a1);
cols1[1]->get(0, b1);
cols2[0]->get(0, a2);
cols2[1]->get(0, b2);
auto res = OpImpl::constantConstant(a1.get<UInt64>(), b1.get<UInt64>(), a2.get<UInt64>(), b2.get<UInt64>());
result_column = DataTypeUInt8().createColumnConst(const_col_left->size(), toField(res));
return true;
}
}
typename ColVecResult::MutablePtr col_res = nullptr;
col_res = ColVecResult::create();
auto & vec_res = col_res->getData();
vec_res.resize(input_rows_count);
// constant tuple - non-constant tuple
if (const ColumnConst * const_col_left = checkAndGetColumnConst<ColumnTuple>(arg1.column.get()))
{
if (const ColumnTuple * col_right = typeid_cast<const ColumnTuple *>(arg2.column.get()))
{
auto const_cols = convertConstTupleToConstantElements(*const_col_left);
Field a1, b1;
const_cols[0]->get(0, a1);
const_cols[1]->get(0, b1);
auto col_r1 = checkAndGetColumn<ColVecT1>(&col_right->getColumn(0));
auto col_r2 = checkAndGetColumn<ColVecT1>(&col_right->getColumn(1));
if (col_r1 && col_r2)
OpImpl::constantVector(a1.get<UInt64>(), b1.get<UInt64>(), col_r1->getData(), col_r2->getData(), vec_res);
else
return false;
}
else
return false;
}
else if (const ColumnTuple * col_left = typeid_cast<const ColumnTuple *>(arg1.column.get()))
{
auto col_l1 = checkAndGetColumn<ColVecT0>(&col_left->getColumn(0));
auto col_l2 = checkAndGetColumn<ColVecT0>(&col_left->getColumn(1));
if (col_l1 && col_l2)
{
// non-constant tuple - constant tuple
if (const ColumnConst * const_col_right = checkAndGetColumnConst<ColumnTuple>(arg2.column.get()))
{
auto const_cols = convertConstTupleToConstantElements(*const_col_right);
Field a2, b2;
const_cols[0]->get(0, a2);
const_cols[1]->get(0, b2);
OpImpl::vectorConstant(col_l1->getData(), col_l2->getData(), a2.get<UInt64>(), b2.get<UInt64>(), vec_res);
}
// non-constant tuple - non-constant tuple
else if (const ColumnTuple * col_right = typeid_cast<const ColumnTuple *>(arg2.column.get()))
{
auto col_r1 = checkAndGetColumn<ColVecT1>(&col_right->getColumn(0));
auto col_r2 = checkAndGetColumn<ColVecT1>(&col_right->getColumn(1));
if (col_r1 && col_r2)
OpImpl::vectorVector(col_l1->getData(), col_l2->getData(), col_r1->getData(), col_r2->getData(), vec_res);
else
return false;
}
else
return false;
}
else
return false;
}
else
return false;
result_column = std::move(col_res);
return true;
});
if (!valid)
throw Exception(getName() + "'s arguments do not match the expected data types", ErrorCodes::ILLEGAL_COLUMN);
return result_column;
}
};
void registerFunctionTupleHammingDistance(FunctionFactory & factory)
{
factory.registerFunction<FunctionTupleHammingDistance>();
}
}
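
A worked example of the semantics above: `tupleHammingDistance((1, 2), (1, 3))` returns 1, since the equal first components contribute 0 and the differing second components contribute 1; for two-element tuples the result is always 0, 1 or 2.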

View File

@ -53,6 +53,7 @@ SRCS(
FunctionsRandom.cpp
FunctionsRound.cpp
FunctionsStringArray.cpp
FunctionsStringHash.cpp
FunctionsStringSimilarity.cpp
GatherUtils/concat.cpp
GatherUtils/createArraySink.cpp
@ -185,6 +186,7 @@ SRCS(
bitBoolMaskAnd.cpp
bitBoolMaskOr.cpp
bitCount.cpp
bitHammingDistance.cpp
bitNot.cpp
bitOr.cpp
bitRotateLeft.cpp
@ -504,6 +506,7 @@ SRCS(
tryBase64Decode.cpp
tuple.cpp
tupleElement.cpp
tupleHammingDistance.cpp
upper.cpp
upperUTF8.cpp
uptime.cpp

View File

@ -28,10 +28,23 @@ bool ReadBufferFromPocoSocket::nextImpl()
ssize_t bytes_read = 0;
Stopwatch watch;
int flags = 0;
if (async_callback)
flags |= MSG_DONTWAIT;
/// Add more details to exceptions.
try
{
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size());
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
/// If async_callback is specified, and read is blocking, run async_callback and try again later.
/// It is expected that file descriptor may be polled externally.
/// Note that receive timeout is not checked here. External code should check it while polling.
while (bytes_read < 0 && async_callback && errno == EAGAIN)
{
async_callback(socket);
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
}
}
catch (const Poco::Net::NetException & e)
{

View File

@ -5,7 +5,6 @@
#include <IO/ReadBuffer.h>
#include <IO/BufferWithOwnMemory.h>
namespace DB
{
@ -28,6 +27,11 @@ public:
ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
bool poll(size_t timeout_microseconds);
void setAsyncCallback(std::function<void(Poco::Net::Socket &)> async_callback_) { async_callback = std::move(async_callback_); }
private:
std::function<void(Poco::Net::Socket &)> async_callback;
};
}
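
A short sketch (editor's illustration, not part of this changeset; `socket`, `read_context` and `fiber` are assumed to be in scope) of how the callback is meant to be used, mirroring ReadCallback in RemoteQueryExecutorReadContext above:

    ReadBufferFromPocoSocket in(socket);
    in.setAsyncCallback([&](Poco::Net::Socket & blocked_socket)
    {
        /// Runs whenever receiveBytes() would block (EAGAIN under MSG_DONTWAIT):
        /// register the descriptor for polling, then yield until it is readable.
        read_context.setSocket(blocked_socket);
        fiber = std::move(fiber).resume();
    });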

View File

@ -624,7 +624,7 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
{
auto & input = inputs[res_elem.name];
if (input.empty())
throw Exception("Cannot find column " + backQuoteIfNeed(res_elem.name) + " in source stream",
throw Exception("Cannot find column " + backQuote(res_elem.name) + " in source stream",
ErrorCodes::THERE_IS_NO_COLUMN);
src_node = actions_dag->inputs[input.front()];
@ -641,12 +641,12 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
if (ignore_constant_values)
src_node = const_cast<Node *>(&actions_dag->addColumn(res_elem, true));
else if (res_const->getField() != src_const->getField())
throw Exception("Cannot convert column " + backQuoteIfNeed(res_elem.name) + " because "
throw Exception("Cannot convert column " + backQuote(res_elem.name) + " because "
"it is constant but values of constants are different in source and result",
ErrorCodes::ILLEGAL_COLUMN);
}
else
throw Exception("Cannot convert column " + backQuoteIfNeed(res_elem.name) + " because "
throw Exception("Cannot convert column " + backQuote(res_elem.name) + " because "
"it is non constant in source stream but must be constant in result",
ErrorCodes::ILLEGAL_COLUMN);
}

View File

@ -735,6 +735,28 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data &
}
}
if (node.is_window_function)
{
// Also add columns from PARTITION BY and ORDER BY of window functions.
// Requiring a constant reference to a shared pointer to non-const AST
// doesn't really look sane, but the visitor does indeed require it.
if (node.window_partition_by)
{
visit(node.window_partition_by->clone(), data);
}
if (node.window_order_by)
{
visit(node.window_order_by->clone(), data);
}
// Don't need to do anything more for window functions here -- the
// resulting column is added in ExpressionAnalyzer, similar to the
// aggregate functions.
return;
}
// An aggregate function can also be calculated as a window function, but we
// checked for it above, so no need to do anything more.
if (AggregateFunctionFactory::instance().isAggregateFunctionName(node.name))
return;

View File

@ -1,6 +1,7 @@
#include <Interpreters/AggregateDescription.h>
#include <Common/FieldVisitors.h>
#include <IO/Operators.h>
#include <Parsers/ASTFunction.h>
namespace DB
{
@ -99,4 +100,31 @@ void AggregateDescription::explain(WriteBuffer & out, size_t indent) const
}
}
std::string WindowFunctionDescription::dump() const
{
WriteBufferFromOwnString ss;
ss << "window function '" << column_name << "\n";
ss << "function node " << function_node->dumpTree() << "\n";
ss << "aggregate function '" << aggregate_function->getName() << "'\n";
if (!function_parameters.empty())
{
ss << "parameters " << toString(function_parameters) << "\n";
}
return ss.str();
}
std::string WindowDescription::dump() const
{
WriteBufferFromOwnString ss;
ss << "window '" << window_name << "'\n";
ss << "partition_by " << dumpSortDescription(partition_by) << "\n";
ss << "order_by " << dumpSortDescription(order_by) << "\n";
ss << "full_sort_description " << dumpSortDescription(full_sort_description) << "\n";
return ss.str();
}
}

View File

@ -1,13 +1,18 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <DataTypes/IDataType.h>
#include <Core/ColumnNumbers.h>
#include <Core/Names.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <Core/SortDescription.h>
#include <Parsers/IAST_fwd.h>
namespace DB
{
class ASTFunction;
struct AggregateDescription
{
AggregateFunctionPtr function;
@ -21,4 +26,44 @@ struct AggregateDescription
using AggregateDescriptions = std::vector<AggregateDescription>;
struct WindowFunctionDescription
{
std::string column_name;
const ASTFunction * function_node;
AggregateFunctionPtr aggregate_function;
Array function_parameters;
DataTypes argument_types;
Names argument_names;
std::string dump() const;
};
struct WindowDescription
{
std::string window_name;
// We don't care about the particular order of keys for PARTITION BY, only
// that they are sorted. For now we always require ASC, but we could be more
// flexible and match any direction, or even different order of columns.
SortDescription partition_by;
SortDescription order_by;
// To calculate the window function, we sort input data first by PARTITION BY,
// then by ORDER BY. This field holds this combined sort order.
SortDescription full_sort_description;
// No frame info as of yet.
// The window functions that are calculated for this window.
std::vector<WindowFunctionDescription> window_functions;
std::string dump() const;
};
using WindowFunctionDescriptions = std::vector<WindowFunctionDescription>;
using WindowDescriptions = std::unordered_map<std::string, WindowDescription>;
}
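
As a worked example of the fields above: for a window such as `OVER (PARTITION BY a ORDER BY b DESC)`, partition_by holds `a` (ascending, as currently required), order_by holds `b` (descending), and full_sort_description is their concatenation `a ASC, b DESC`, which is the order the input must be sorted in before this window's functions are computed.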

View File

@ -212,18 +212,18 @@ void AsynchronousMetrics::update()
{
Int64 amount = total_memory_tracker.get();
Int64 peak = total_memory_tracker.getPeak();
Int64 new_peak = data.resident;
Int64 new_amount = data.resident;
LOG_DEBUG(&Poco::Logger::get("AsynchronousMetrics"),
"MemoryTracking: was {}, peak {}, will set to {} (RSS), difference: {}",
ReadableSize(amount),
ReadableSize(peak),
ReadableSize(new_peak),
ReadableSize(new_peak - peak)
ReadableSize(new_amount),
ReadableSize(new_amount - amount)
);
total_memory_tracker.set(new_peak);
CurrentMetrics::set(CurrentMetrics::MemoryTracking, new_peak);
total_memory_tracker.set(new_amount);
CurrentMetrics::set(CurrentMetrics::MemoryTracking, new_amount);
}
}
#endif

View File

@ -126,6 +126,7 @@ void SelectStreamFactory::createForShard(
bool add_agg_info = processed_stage == QueryProcessingStage::WithMergeableState;
bool add_totals = false;
bool add_extremes = false;
bool async_read = context_ptr->getSettingsRef().async_socket_for_remote;
if (processed_stage == QueryProcessingStage::Complete)
{
add_totals = query_ast->as<ASTSelectQuery &>().group_by_with_totals;
@ -153,7 +154,7 @@ void SelectStreamFactory::createForShard(
if (!table_func_ptr)
remote_query_executor->setMainTable(main_table);
remote_pipes.emplace_back(createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes));
remote_pipes.emplace_back(createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read));
remote_pipes.back().addInterpreterContext(context_ptr);
};
@ -249,7 +250,7 @@ void SelectStreamFactory::createForShard(
pool = shard_info.pool, shard_num = shard_info.shard_num, modified_query, header = header, modified_query_ast,
&context, context_ptr, throttler,
main_table = main_table, table_func_ptr = table_func_ptr, scalars = scalars, external_tables = external_tables,
stage = processed_stage, local_delay, add_agg_info, add_totals, add_extremes]()
stage = processed_stage, local_delay, add_agg_info, add_totals, add_extremes, async_read]()
-> Pipe
{
auto current_settings = context.getSettingsRef();
@ -295,7 +296,7 @@ void SelectStreamFactory::createForShard(
auto remote_query_executor = std::make_shared<RemoteQueryExecutor>(
std::move(connections), modified_query, header, context, throttler, scalars, external_tables, stage);
return createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes);
return createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read);
}
};

View File

@ -50,6 +50,9 @@
#include <Interpreters/GlobalSubqueriesVisitor.h>
#include <Interpreters/GetAggregatesVisitor.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>
namespace DB
{
@ -58,12 +61,14 @@ using LogAST = DebugASTLog<false>; /// set to true to enable logs
namespace ErrorCodes
{
extern const int UNKNOWN_TYPE_OF_AST_NODE;
extern const int UNKNOWN_IDENTIFIER;
extern const int BAD_ARGUMENTS;
extern const int ILLEGAL_PREWHERE;
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER;
extern const int LOGICAL_ERROR;
extern const int NOT_IMPLEMENTED;
extern const int UNKNOWN_IDENTIFIER;
extern const int UNKNOWN_TYPE_OF_AST_NODE;
}
namespace
@ -283,6 +288,8 @@ void ExpressionAnalyzer::analyzeAggregation()
{
aggregated_columns = temp_actions->getNamesAndTypesList();
}
has_window = makeWindowDescriptions(temp_actions);
}
@ -444,7 +451,11 @@ bool ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions)
auto it = index.find(name);
if (it == index.end())
throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, "Unknown identifier (in aggregate function '{}'): {}", node->name, name);
{
throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER,
"Unknown identifier '{}' in aggregate function '{}'",
name, node->formatForErrorMessage());
}
types[i] = (*it)->result_type;
aggregate.argument_names[i] = name;
@ -461,6 +472,128 @@ bool ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions)
}
bool ExpressionAnalyzer::makeWindowDescriptions(ActionsDAGPtr & actions)
{
// Convenient to check here because at least we have the Context.
if (!syntax->window_function_asts.empty() &&
!context.getSettingsRef().allow_experimental_window_functions)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
"Window functions are not implemented (while processing '{}')",
syntax->window_function_asts[0]->formatForErrorMessage());
}
for (const ASTFunction * function_node : syntax->window_function_asts)
{
assert(function_node->is_window_function);
WindowDescription window_description;
window_description.window_name = function_node->getWindowDescription();
if (function_node->window_partition_by)
{
for (const auto & column_ast
: function_node->window_partition_by->children)
{
const auto * with_alias = dynamic_cast<const ASTWithAlias *>(
column_ast.get());
if (!with_alias)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Expected a column in PARTITION BY for window '{}',"
" got '{}'", window_description.window_name,
column_ast->formatForErrorMessage());
}
window_description.partition_by.push_back(
SortColumnDescription(
with_alias->getColumnName(), 1 /* direction */,
1 /* nulls_direction */));
}
}
if (function_node->window_order_by)
{
for (const auto & column_ast
: function_node->window_order_by->children)
{
// Parser should have checked that we have a proper element here.
const auto & order_by_element
= column_ast->as<ASTOrderByElement &>();
// Ignore collation for now.
window_description.order_by.push_back(
SortColumnDescription(
order_by_element.children.front()->getColumnName(),
order_by_element.direction,
order_by_element.nulls_direction));
}
}
window_description.full_sort_description = window_description.partition_by;
window_description.full_sort_description.insert(
window_description.full_sort_description.end(),
window_description.order_by.begin(),
window_description.order_by.end());
WindowFunctionDescription window_function;
window_function.function_node = function_node;
window_function.column_name
= window_function.function_node->getColumnName();
window_function.function_parameters
= window_function.function_node->parameters
? getAggregateFunctionParametersArray(
window_function.function_node->parameters)
: Array();
// Requiring a constant reference to a shared pointer to non-const AST
// doesn't really look sane, but the visitor does indeed require it.
// Hence we clone the node (not very sane either, I know).
getRootActionsNoMakeSet(window_function.function_node->clone(),
true, actions);
const ASTs & arguments
= window_function.function_node->arguments->children;
window_function.argument_types.resize(arguments.size());
window_function.argument_names.resize(arguments.size());
const auto & index = actions->getIndex();
for (size_t i = 0; i < arguments.size(); ++i)
{
const std::string & name = arguments[i]->getColumnName();
auto it = index.find(name);
if (it == index.end())
{
throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER,
"Unknown identifier '{}' in window function '{}'",
name, window_function.function_node->formatForErrorMessage());
}
window_function.argument_types[i] = (*it)->result_type;
window_function.argument_names[i] = name;
}
AggregateFunctionProperties properties;
window_function.aggregate_function
= AggregateFunctionFactory::instance().get(
window_function.function_node->name,
window_function.argument_types,
window_function.function_parameters, properties);
auto [it, inserted] = window_descriptions.insert(
{window_description.window_name, window_description});
if (!inserted)
{
assert(it->second.full_sort_description
== window_description.full_sort_description);
}
it->second.window_functions.push_back(window_function);
}
return !syntax->window_function_asts.empty();
}
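One consequence of keying window_descriptions by the formatted window text, sketched on a hypothetical query: functions whose OVER clauses format identically share a single WindowDescription, so their common sort is performed only once.

    // SELECT count(*) OVER (ORDER BY x), sum(y) OVER (ORDER BY x) FROM t
    // Both windows format as "ORDER BY x", so the second insert is a no-op
    // (inserted == false) and only the function list grows; the assert above
    // double-checks that the sort descriptions really match.
    auto [it, inserted] = window_descriptions.insert(
        {window_description.window_name, window_description});
    it->second.window_functions.push_back(window_function);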
const ASTSelectQuery * ExpressionAnalyzer::getSelectQuery() const
{
const auto * select_query = query->as<ASTSelectQuery>();
@ -831,6 +964,65 @@ void SelectQueryExpressionAnalyzer::appendAggregateFunctionsArguments(Expression
getRootActions(argument, only_types, step.actions());
}
void SelectQueryExpressionAnalyzer::appendWindowFunctionsArguments(
ExpressionActionsChain & chain, bool /* only_types */)
{
ExpressionActionsChain::Step & step = chain.lastStep(aggregated_columns);
// 1) Add actions for window functions and their arguments;
// 2) Mark the columns that are really required.
for (const auto & [_, w] : window_descriptions)
{
for (const auto & f : w.window_functions)
{
// 1.1) arguments of window functions;
// Requiring a constant reference to a shared pointer to non-const AST
// doesn't really look sane, but the visitor does indeed require it.
getRootActionsNoMakeSet(f.function_node->clone(),
true /* no_subqueries */, step.actions());
// 1.2) result of window function: an empty INPUT.
// It is an aggregate function, so it won't be added by getRootActions.
// This is something of a hack. Other options:
// a] do it like aggregate function -- break the chain of actions
// and manually add window functions to the starting list of
// input columns. Logically this is similar to what we're doing
// now, but would require splitting the window function processing
// into a full-fledged step after plain functions. This would be
// somewhat cumbersome. With the INPUT hack we can avoid a separate
// step and pretend that window functions are almost "normal"
// select functions. The limitation of both these ways is that
// we can't reference window functions in other SELECT
// expressions.
// b] add a WINDOW action type, then sort, then split the chain on
// each WINDOW action and insert the Window pipeline between the
// Expression pipelines. This is a "proper" way that would allow
// us to depend on window functions in other functions. But it's
// complicated so I avoid doing it for now.
ColumnWithTypeAndName col;
col.type = f.aggregate_function->getReturnType();
col.column = col.type->createColumn();
col.name = f.column_name;
step.actions()->addInput(col);
for (const auto & a : f.function_node->arguments->children)
{
// 2.1) function arguments;
step.required_output.push_back(a->getColumnName());
}
// 2.2) function result;
step.required_output.push_back(f.column_name);
}
// 2.3) PARTITION BY and ORDER BY columns.
for (const auto & c : w.full_sort_description)
{
step.required_output.push_back(c.column_name);
}
}
}
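To make the trade-off discussed above concrete, here is the fake-INPUT idea in isolation, as a hedged sketch (the column name and type are hypothetical; addInput with a ColumnWithTypeAndName is the overload used in the code):

    // The value of "count() OVER (ORDER BY x)" does not exist yet, but later
    // steps must be able to refer to it by name, so an empty placeholder
    // column is registered as an input of the actions DAG:
    ColumnWithTypeAndName placeholder;
    placeholder.name = "count() OVER (ORDER BY x)";
    placeholder.type = std::make_shared<DataTypeUInt64>(); // the function's return type
    placeholder.column = placeholder.type->createColumn(); // empty column
    step.actions()->addInput(placeholder);
    // The limitation mentioned above follows: an expression such as
    // 1 + count() OVER (ORDER BY x) cannot be computed in the same chain,
    // because nothing fills this input until the Window step runs.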
bool SelectQueryExpressionAnalyzer::appendHaving(ExpressionActionsChain & chain, bool only_types)
{
const auto * select_query = getAggregatingQuery();
@ -855,8 +1047,10 @@ void SelectQueryExpressionAnalyzer::appendSelect(ExpressionActionsChain & chain,
getRootActions(select_query->select(), only_types, step.actions());
for (const auto & child : select_query->select()->children)
{
step.required_output.push_back(child->getColumnName());
}
}
ActionsDAGPtr SelectQueryExpressionAnalyzer::appendOrderBy(ExpressionActionsChain & chain, bool only_types, bool optimize_read_in_order,
ManyExpressionActions & order_by_elements_actions)
@ -1076,6 +1270,7 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
: first_stage(first_stage_)
, second_stage(second_stage_)
, need_aggregate(query_analyzer.hasAggregation())
, has_window(query_analyzer.hasWindow())
{
/// first_stage: Do I need to perform the first part of the pipeline - running on remote servers during distributed processing.
/// second_stage: Do I need to execute the second part of the pipeline - running on the initiating server during distributed processing.
@ -1225,6 +1420,9 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
/// If there is aggregation, we execute expressions in SELECT and ORDER BY on the initiating server, otherwise on the source servers.
query_analyzer.appendSelect(chain, only_types || (need_aggregate ? !second_stage : !first_stage));
query_analyzer.appendWindowFunctionsArguments(chain, only_types || !first_stage);
selected_columns = chain.getLastStep().required_output;
has_order_by = query.orderBy() != nullptr;
before_order_and_select = query_analyzer.appendOrderBy(
@ -1321,4 +1519,75 @@ void ExpressionAnalysisResult::checkActions() const
}
}
std::string ExpressionAnalysisResult::dump() const
{
WriteBufferFromOwnString ss;
ss << "need_aggregate " << need_aggregate << "\n";
ss << "has_order_by " << has_order_by << "\n";
ss << "has_window " << has_window << "\n";
if (before_array_join)
{
ss << "before_array_join " << before_array_join->dumpDAG() << "\n";
}
if (array_join)
{
ss << "array_join " << "FIXME doesn't have dump" << "\n";
}
if (before_join)
{
ss << "before_join " << before_join->dumpDAG() << "\n";
}
if (before_where)
{
ss << "before_where " << before_where->dumpDAG() << "\n";
}
if (prewhere_info)
{
ss << "prewhere_info " << prewhere_info->dump() << "\n";
}
if (filter_info)
{
ss << "filter_info " << filter_info->dump() << "\n";
}
if (before_aggregation)
{
ss << "before_aggregation " << before_aggregation->dumpDAG() << "\n";
}
if (before_having)
{
ss << "before_having " << before_having->dumpDAG() << "\n";
}
if (before_window)
{
ss << "before_window " << before_window->dumpDAG() << "\n";
}
if (before_order_and_select)
{
ss << "before_order_and_select " << before_order_and_select->dumpDAG() << "\n";
}
if (before_limit_by)
{
ss << "before_limit_by " << before_limit_by->dumpDAG() << "\n";
}
if (final_projection)
{
ss << "final_projection " << final_projection->dumpDAG() << "\n";
}
return ss.str();
}
}
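For reference, a rough sketch of what dump() prints for a simple aggregating query without window functions (ActionsDAG dumps abbreviated; the exact text depends on dumpDAG):

    need_aggregate 1
    has_order_by 0
    has_window 0
    before_aggregation <ActionsDAG dump>
    before_order_and_select <ActionsDAG dump>
    final_projection <ActionsDAG dump>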

View File

@ -60,6 +60,10 @@ struct ExpressionAnalyzerData
NamesAndTypesList aggregation_keys;
AggregateDescriptions aggregate_descriptions;
bool has_window = false;
WindowDescriptions window_descriptions;
NamesAndTypesList window_columns;
bool has_global_subqueries = false;
/// All new temporary tables obtained by performing the GLOBAL IN/JOIN subqueries.
@ -116,6 +120,9 @@ public:
/// Get intermediates for tests
const ExpressionAnalyzerData & getAnalyzedData() const { return *this; }
/// A list of windows for window functions.
const WindowDescriptions & windowDescriptions() const { return window_descriptions; }
protected:
ExpressionAnalyzer(
const ASTPtr & query_,
@ -159,6 +166,8 @@ protected:
void analyzeAggregation();
bool makeAggregateDescriptions(ActionsDAGPtr & actions);
bool makeWindowDescriptions(ActionsDAGPtr & actions);
const ASTSelectQuery * getSelectQuery() const;
bool isRemoteStorage() const { return syntax->is_remote_storage; }
@ -169,6 +178,8 @@ class SelectQueryExpressionAnalyzer;
/// Result of SelectQueryExpressionAnalyzer: expressions for InterpreterSelectQuery
struct ExpressionAnalysisResult
{
std::string dump() const;
/// Do I need to perform the first part of the pipeline - running on remote servers during distributed processing.
bool first_stage = false;
/// Do I need to execute the second part of the pipeline - running on the initiating server during distributed processing.
@ -176,6 +187,7 @@ struct ExpressionAnalysisResult
bool need_aggregate = false;
bool has_order_by = false;
bool has_window = false;
bool remove_where_filter = false;
bool optimize_read_in_order = false;
@ -189,6 +201,7 @@ struct ExpressionAnalysisResult
ActionsDAGPtr before_where;
ActionsDAGPtr before_aggregation;
ActionsDAGPtr before_having;
ActionsDAGPtr before_window;
ActionsDAGPtr before_order_and_select;
ActionsDAGPtr before_limit_by;
ActionsDAGPtr final_projection;
@ -256,6 +269,7 @@ public:
/// Does the expression have aggregate functions or a GROUP BY or HAVING section.
bool hasAggregation() const { return has_aggregation; }
bool hasWindow() const { return has_window; }
bool hasGlobalSubqueries() { return has_global_subqueries; }
bool hasTableJoin() const { return syntax->ast_join; }
@ -326,6 +340,7 @@ private:
bool appendWhere(ExpressionActionsChain & chain, bool only_types);
bool appendGroupBy(ExpressionActionsChain & chain, bool only_types, bool optimize_aggregation_in_order, ManyExpressionActions &);
void appendAggregateFunctionsArguments(ExpressionActionsChain & chain, bool only_types);
void appendWindowFunctionsArguments(ExpressionActionsChain & chain, bool only_types);
/// After aggregation:
bool appendHaving(ExpressionActionsChain & chain, bool only_types);

View File

@ -19,9 +19,18 @@ void ExpressionInfoMatcher::visit(const ASTPtr & ast, Data & data)
void ExpressionInfoMatcher::visit(const ASTFunction & ast_function, const ASTPtr &, Data & data)
{
if (ast_function.name == "arrayJoin")
{
data.is_array_join = true;
else if (AggregateFunctionFactory::instance().isAggregateFunctionName(ast_function.name))
}
// "is_aggregate_function" doesn't mean much by itself. Apparently here it is
// used to move filters from HAVING to WHERE, and probably for this purpose
// an aggregate function calculated as a window function is not relevant.
else if (!ast_function.is_window_function
&& AggregateFunctionFactory::instance().isAggregateFunctionName(
ast_function.name))
{
data.is_aggregate_function = true;
}
else
{
const auto & function = FunctionFactory::instance().tryGet(ast_function.name, data.context);

View File

@ -19,8 +19,12 @@ public:
struct Data
{
const char * assert_no_aggregates = nullptr;
std::unordered_set<String> uniq_names;
std::vector<const ASTFunction *> aggregates;
const char * assert_no_windows = nullptr;
// Explicit empty initializers are needed to make designated initializers
// work on GCC 10.
std::unordered_set<String> uniq_names {};
std::vector<const ASTFunction *> aggregates {};
std::vector<const ASTFunction *> window_functions {};
};
static bool needChildVisit(const ASTPtr & node, const ASTPtr & child)
@ -28,8 +32,13 @@ public:
if (child->as<ASTSubquery>() || child->as<ASTSelectQuery>())
return false;
if (auto * func = node->as<ASTFunction>())
if (isAggregateFunction(func->name))
{
if (isAggregateFunction(*func)
|| func->is_window_function)
{
return false;
}
}
return true;
}
@ -42,9 +51,8 @@ public:
private:
static void visit(const ASTFunction & node, const ASTPtr &, Data & data)
{
if (!isAggregateFunction(node.name))
return;
if (isAggregateFunction(node))
{
if (data.assert_no_aggregates)
throw Exception("Aggregate function " + node.getColumnName() + " is found " + String(data.assert_no_aggregates) + " in query",
ErrorCodes::ILLEGAL_AGGREGATION);
@ -56,19 +64,43 @@ private:
data.uniq_names.insert(column_name);
data.aggregates.push_back(&node);
}
static bool isAggregateFunction(const String & name)
else if (node.is_window_function)
{
return AggregateFunctionFactory::instance().isAggregateFunctionName(name);
if (data.assert_no_windows)
throw Exception("Window function " + node.getColumnName() + " is found " + String(data.assert_no_windows) + " in query",
ErrorCodes::ILLEGAL_AGGREGATION);
String column_name = node.getColumnName();
if (data.uniq_names.count(column_name))
return;
data.uniq_names.insert(column_name);
data.window_functions.push_back(&node);
}
}
static bool isAggregateFunction(const ASTFunction & node)
{
// Aggregate functions can also be calculated as window functions, but
// here we are interested in aggregate functions calculated in GROUP BY.
return !node.is_window_function
&& AggregateFunctionFactory::instance().isAggregateFunctionName(
node.name);
}
};
using GetAggregatesVisitor = GetAggregatesMatcher::Visitor;
inline void assertNoWindows(const ASTPtr & ast, const char * description)
{
GetAggregatesVisitor::Data data{.assert_no_windows = description};
GetAggregatesVisitor(data).visit(ast);
}
inline void assertNoAggregates(const ASTPtr & ast, const char * description)
{
GetAggregatesVisitor::Data data{description, {}, {}};
GetAggregatesVisitor::Data data{.assert_no_aggregates = description};
GetAggregatesVisitor(data).visit(ast);
}

View File

@ -35,36 +35,37 @@
#include <Interpreters/QueryAliasesVisitor.h>
#include <Processors/Pipe.h>
#include <Processors/Sources/SourceFromInputStream.h>
#include <Processors/Sources/NullSource.h>
#include <Processors/Transforms/ExpressionTransform.h>
#include <Processors/Transforms/JoiningTransform.h>
#include <Processors/Transforms/AggregatingTransform.h>
#include <Processors/Transforms/FilterTransform.h>
#include <Processors/QueryPlan/ArrayJoinStep.h>
#include <Processors/QueryPlan/SettingQuotaAndLimitsStep.h>
#include <Processors/QueryPlan/ExpressionStep.h>
#include <Processors/QueryPlan/FilterStep.h>
#include <Processors/QueryPlan/ReadNothingStep.h>
#include <Processors/QueryPlan/ReadFromPreparedSource.h>
#include <Processors/QueryPlan/PartialSortingStep.h>
#include <Processors/QueryPlan/MergeSortingStep.h>
#include <Processors/QueryPlan/MergingSortedStep.h>
#include <Processors/QueryPlan/DistinctStep.h>
#include <Processors/QueryPlan/LimitByStep.h>
#include <Processors/QueryPlan/LimitStep.h>
#include <Processors/QueryPlan/MergingAggregatedStep.h>
#include <Processors/QueryPlan/AddingDelayedSourceStep.h>
#include <Processors/QueryPlan/AggregatingStep.h>
#include <Processors/QueryPlan/ArrayJoinStep.h>
#include <Processors/QueryPlan/CreatingSetsStep.h>
#include <Processors/QueryPlan/TotalsHavingStep.h>
#include <Processors/QueryPlan/RollupStep.h>
#include <Processors/QueryPlan/CubeStep.h>
#include <Processors/QueryPlan/FillingStep.h>
#include <Processors/QueryPlan/DistinctStep.h>
#include <Processors/QueryPlan/ExpressionStep.h>
#include <Processors/QueryPlan/ExtremesStep.h>
#include <Processors/QueryPlan/OffsetStep.h>
#include <Processors/QueryPlan/FillingStep.h>
#include <Processors/QueryPlan/FilterStep.h>
#include <Processors/QueryPlan/FinishSortingStep.h>
#include <Processors/QueryPlan/LimitByStep.h>
#include <Processors/QueryPlan/LimitStep.h>
#include <Processors/QueryPlan/MergeSortingStep.h>
#include <Processors/QueryPlan/MergingAggregatedStep.h>
#include <Processors/QueryPlan/MergingSortedStep.h>
#include <Processors/QueryPlan/OffsetStep.h>
#include <Processors/QueryPlan/PartialSortingStep.h>
#include <Processors/QueryPlan/QueryPlan.h>
#include <Processors/QueryPlan/ReadFromPreparedSource.h>
#include <Processors/QueryPlan/ReadNothingStep.h>
#include <Processors/QueryPlan/RollupStep.h>
#include <Processors/QueryPlan/SettingQuotaAndLimitsStep.h>
#include <Processors/QueryPlan/TotalsHavingStep.h>
#include <Processors/QueryPlan/WindowStep.h>
#include <Processors/Sources/NullSource.h>
#include <Processors/Sources/SourceFromInputStream.h>
#include <Processors/Transforms/AggregatingTransform.h>
#include <Processors/Transforms/ExpressionTransform.h>
#include <Processors/Transforms/FilterTransform.h>
#include <Processors/Transforms/JoiningTransform.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/MergeTreeWhereOptimizer.h>
@ -958,6 +959,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
else
{
executeExpression(query_plan, expressions.before_order_and_select, "Before ORDER BY and SELECT");
executeWindow(query_plan);
executeDistinct(query_plan, true, expressions.selected_columns, true);
}
@ -1004,6 +1006,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
executeHaving(query_plan, expressions.before_having);
executeExpression(query_plan, expressions.before_order_and_select, "Before ORDER BY and SELECT");
executeWindow(query_plan);
executeDistinct(query_plan, true, expressions.selected_columns, true);
}
@ -1749,6 +1752,58 @@ void InterpreterSelectQuery::executeExpression(QueryPlan & query_plan, const Act
}
void InterpreterSelectQuery::executeWindow(QueryPlan & query_plan)
{
for (const auto & [_, w] : query_analyzer->windowDescriptions())
{
const Settings & settings = context->getSettingsRef();
auto partial_sorting = std::make_unique<PartialSortingStep>(
query_plan.getCurrentDataStream(),
w.full_sort_description,
0 /* LIMIT */,
SizeLimits(settings.max_rows_to_sort, settings.max_bytes_to_sort,
settings.sort_overflow_mode));
partial_sorting->setStepDescription("Sort each block for window '"
+ w.window_name + "'");
query_plan.addStep(std::move(partial_sorting));
auto merge_sorting_step = std::make_unique<MergeSortingStep>(
query_plan.getCurrentDataStream(),
w.full_sort_description,
settings.max_block_size,
0 /* LIMIT */,
settings.max_bytes_before_remerge_sort,
settings.remerge_sort_lowered_memory_bytes_ratio,
settings.max_bytes_before_external_sort,
context->getTemporaryVolume(),
settings.min_free_disk_space_for_temporary_data);
merge_sorting_step->setStepDescription("Merge sorted blocks for window '"
+ w.window_name + "'");
query_plan.addStep(std::move(merge_sorting_step));
// Note the confusingly similar names: the previous step was MergeSorting, this one is MergingSorted.
auto merging_sorted = std::make_unique<MergingSortedStep>(
query_plan.getCurrentDataStream(),
w.full_sort_description,
settings.max_block_size,
0 /* LIMIT */);
merging_sorted->setStepDescription("Merge sorted streams for window '"
+ w.window_name + "'");
query_plan.addStep(std::move(merging_sorted));
auto window_step = std::make_unique<WindowStep>(
query_plan.getCurrentDataStream(),
w,
w.window_functions);
window_step->setStepDescription("Window step for window '"
+ w.window_name + "'");
query_plan.addStep(std::move(window_step));
}
}
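Taken together, each window contributes a sort-and-compute fragment to the plan. For one window, the step sequence reads roughly as follows (descriptions taken from the code above; exact EXPLAIN formatting may differ):

    PartialSorting   Sort each block for window 'PARTITION BY a ORDER BY b'
    MergeSorting     Merge sorted blocks for window 'PARTITION BY a ORDER BY b'
    MergingSorted    Merge sorted streams for window 'PARTITION BY a ORDER BY b'
    Window           Window step for window 'PARTITION BY a ORDER BY b'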
void InterpreterSelectQuery::executeOrderOptimized(QueryPlan & query_plan, InputOrderInfoPtr input_sorting_info, UInt64 limit, SortDescription & output_order_descr)
{
const Settings & settings = context->getSettingsRef();
@ -1795,9 +1850,13 @@ void InterpreterSelectQuery::executeOrder(QueryPlan & query_plan, InputOrderInfo
/// Merge the sorted blocks.
auto merge_sorting_step = std::make_unique<MergeSortingStep>(
query_plan.getCurrentDataStream(),
output_order_descr, settings.max_block_size, limit,
settings.max_bytes_before_remerge_sort, settings.remerge_sort_lowered_memory_bytes_ratio,
settings.max_bytes_before_external_sort, context->getTemporaryVolume(),
output_order_descr,
settings.max_block_size,
limit,
settings.max_bytes_before_remerge_sort,
settings.remerge_sort_lowered_memory_bytes_ratio,
settings.max_bytes_before_external_sort,
context->getTemporaryVolume(),
settings.min_free_disk_space_for_temporary_data);
merge_sorting_step->setStepDescription("Merge sorted blocks for ORDER BY");

View File

@ -120,6 +120,8 @@ private:
void executeTotalsAndHaving(QueryPlan & query_plan, bool has_having, const ActionsDAGPtr & expression, bool overflow_row, bool final);
void executeHaving(QueryPlan & query_plan, const ActionsDAGPtr & expression);
static void executeExpression(QueryPlan & query_plan, const ActionsDAGPtr & expression, const std::string & description);
/// FIXME should go through ActionsDAG to behave as a proper function
void executeWindow(QueryPlan & query_plan);
void executeOrder(QueryPlan & query_plan, InputOrderInfoPtr sorting_info);
void executeOrderOptimized(QueryPlan & query_plan, InputOrderInfoPtr sorting_info, UInt64 limit, SortDescription & output_order_descr);
void executeWithFill(QueryPlan & query_plan);

View File

@ -43,9 +43,14 @@ public:
if (group_by_function_hashes.count(key))
return false;
/// if ORDER BY contains aggregate function it shouldn't be optimized
if (AggregateFunctionFactory::instance().isAggregateFunctionName(ast_function.name))
/// if ORDER BY contains an aggregate or window function, it
/// shouldn't be optimized
if (ast_function.is_window_function
|| AggregateFunctionFactory::instance().isAggregateFunctionName(
ast_function.name))
{
return false;
}
return true;
}

View File

@ -38,8 +38,16 @@ bool extractIdentifiers(const ASTFunction & func, std::unordered_set<ASTPtr *> &
if (arg_func->name == "lambda")
return false;
if (AggregateFunctionFactory::instance().isAggregateFunctionName(arg_func->name))
// We are looking for identifiers inside a function calculated inside
// the aggregate function `any()`. A window or aggregate function can't
// appear inside `any()`, but this check in GetAggregatesMatcher happens
// later, so we have to explicitly skip these nested functions here.
if (arg_func->is_window_function
|| AggregateFunctionFactory::instance().isAggregateFunctionName(
arg_func->name))
{
return false;
}
if (!extractIdentifiers(*arg_func, identifiers))
return false;

View File

@ -439,12 +439,46 @@ std::vector<const ASTFunction *> getAggregates(ASTPtr & query, const ASTSelectQu
/// There cannot be other aggregate functions inside the arguments of aggregate functions.
for (const ASTFunction * node : data.aggregates)
{
if (node->arguments)
{
for (auto & arg : node->arguments->children)
{
assertNoAggregates(arg, "inside another aggregate function");
assertNoWindows(arg, "inside an aggregate function");
}
}
}
return data.aggregates;
}
std::vector<const ASTFunction *> getWindowFunctions(ASTPtr & query, const ASTSelectQuery & select_query)
{
/// There cannot be window functions inside WHERE and PREWHERE.
if (select_query.where())
assertNoWindows(select_query.where(), "in WHERE");
if (select_query.prewhere())
assertNoWindows(select_query.prewhere(), "in PREWHERE");
GetAggregatesVisitor::Data data;
GetAggregatesVisitor(data).visit(query);
/// There cannot be aggregate functions or other window functions inside the arguments of window functions.
for (const ASTFunction * node : data.window_functions)
{
if (node->arguments)
{
for (auto & arg : node->arguments->children)
{
assertNoAggregates(arg, "inside a window function");
assertNoWindows(arg, "inside another window function");
}
}
}
return data.window_functions;
}
}
TreeRewriterResult::TreeRewriterResult(
@ -640,14 +674,24 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
for (const auto & name : columns_context.requiredColumns())
ss << " '" << name << "'";
if (!source_column_names.empty())
if (storage)
{
ss << ", source columns:";
for (const auto & name : source_column_names)
ss << " '" << name << "'";
ss << ", maybe you meant: ";
for (const auto & name : columns_context.requiredColumns())
{
auto hints = storage->getHints(name);
if (!hints.empty())
ss << " '" << toString(hints) << "'";
}
}
else
{
if (!source_column_names.empty())
{
ss << ", source columns:";
for (const auto & name : source_column_names)
ss << " '" << name << "'";
}
else
ss << ", no source columns";
}
if (columns_context.has_table_join)
{
@ -733,6 +777,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
collectJoinedColumns(*result.analyzed_join, *select_query, tables_with_columns, result.aliases);
result.aggregates = getAggregates(query, *select_query);
result.window_function_asts = getWindowFunctions(query, *select_query);
result.collectUsedColumns(query, true);
result.ast_join = select_query->join();

View File

@ -35,6 +35,8 @@ struct TreeRewriterResult
Aliases aliases;
std::vector<const ASTFunction *> aggregates;
std::vector<const ASTFunction *> window_function_asts;
/// Which column is needed to be ARRAY-JOIN'ed to get the specified.
/// For example, for `SELECT s.v ... ARRAY JOIN a AS s` will get "s.v" -> "a.v".
NameToNameMap array_join_result_to_source;

View File

@ -32,6 +32,9 @@ target_link_libraries (string_hash_map_aggregation PRIVATE dbms)
add_executable (string_hash_set string_hash_set.cpp)
target_link_libraries (string_hash_set PRIVATE dbms)
add_executable (context context.cpp)
target_link_libraries (context PRIVATE dbms)
add_executable (two_level_hash_map two_level_hash_map.cpp)
target_include_directories (two_level_hash_map SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR})
target_link_libraries (two_level_hash_map PRIVATE dbms)

View File

@ -0,0 +1,90 @@
#include <iostream>
/// #define BOOST_USE_UCONTEXT
#include <Common/Fiber.h>
// #include <boost/context/pooled_fixedsize_stack.hpp>
// #include <boost/context/segmented_stack.hpp>
#include <Common/Exception.h>
#include <Common/FiberStack.h>
void __attribute__((__noinline__)) foo(std::exception_ptr exception)
{
if (exception)
std::rethrow_exception(exception);
}
void __attribute__((__noinline__)) bar(int a)
{
std::cout << StackTrace().toString() << std::endl;
if (a > 0)
throw DB::Exception(0, "hello");
}
void __attribute__((__noinline__)) gar(int a)
{
char buf[1024];
buf[1023] = a & 255;
if (a > 2)
return gar(a - 1);
else
bar(a);
}
int main(int, char **)
try
{
namespace ctx = boost::context;
int a;
std::exception_ptr exception;
// ctx::protected_fixedsize allocator
// ctx::pooled_fixedsize_stack(1024 * 64 + 2 * 2 * 1024 * 1024 * 16, 1)
ctx::fiber source{std::allocator_arg_t(), FiberStack(), [&](ctx::fiber&& sink)
{
a = 0;
int b = 1;
for (size_t i = 0; i < 9; ++i)
{
sink = std::move(sink).resume();
int next = a + b;
a = b;
b = next;
}
try
{
gar(1024);
}
catch (...)
{
std::cout << "Saving exception\n";
exception = std::current_exception();
}
return std::move(sink);
}};
for (int j = 0; j < 10; ++j)
{
try
{
source = std::move(source).resume();
}
catch (DB::Exception & e)
{
std::cout << "Caught exception in resume " << e.getStackTraceString() << std::endl;
}
std::cout << a << " ";
}
std::cout << std::endl;
try
{
foo(exception);
}
catch (const DB::Exception & e)
{
std::cout << e.getStackTraceString() << std::endl;
}
}
catch (...)
{
std::cerr << "Uncaught exception\n";
}

View File

@ -1,14 +1,15 @@
#include <Common/typeid_cast.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTWithAlias.h>
#include <Parsers/ASTSubquery.h>
#include <Parsers/ASTExpressionList.h>
#include <IO/WriteHelpers.h>
#include <IO/WriteBufferFromString.h>
#include <Common/SipHash.h>
#include <IO/Operators.h>
#include <Common/SipHash.h>
#include <Common/typeid_cast.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTSubquery.h>
#include <Parsers/ASTWithAlias.h>
namespace DB
{
@ -54,6 +55,21 @@ ASTPtr ASTFunction::clone() const
if (arguments) { res->arguments = arguments->clone(); res->children.push_back(res->arguments); }
if (parameters) { res->parameters = parameters->clone(); res->children.push_back(res->parameters); }
if (window_name)
{
res->set(res->window_name, window_name->clone());
}
if (window_partition_by)
{
res->set(res->window_partition_by, window_partition_by->clone());
}
if (window_order_by)
{
res->set(res->window_order_by, window_order_by->clone());
}
return res;
}
@ -411,8 +427,11 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format
}
}
if (!written)
if (written)
{
return;
}
settings.ostr << (settings.hilite ? hilite_function : "") << name;
if (parameters)
@ -449,6 +468,50 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format
settings.ostr << (settings.hilite ? hilite_function : "") << ')';
settings.ostr << (settings.hilite ? hilite_none : "");
if (!is_window_function)
{
return;
}
settings.ostr << " OVER (";
appendWindowDescription(settings, state, nested_dont_need_parens);
settings.ostr << ")";
}
std::string ASTFunction::getWindowDescription() const
{
WriteBufferFromOwnString ostr;
FormatSettings settings{ostr, true /* one_line */};
FormatState state;
FormatStateStacked frame;
appendWindowDescription(settings, state, frame);
return ostr.str();
}
void ASTFunction::appendWindowDescription(const FormatSettings & settings,
FormatState & state, FormatStateStacked frame) const
{
if (!is_window_function)
{
return;
}
if (window_partition_by)
{
settings.ostr << "PARTITION BY ";
window_partition_by->formatImpl(settings, state, frame);
}
if (window_partition_by && window_order_by)
{
settings.ostr << " ";
}
if (window_order_by)
{
settings.ostr << "ORDER BY ";
window_order_by->formatImpl(settings, state, frame);
}
}

View File

@ -8,6 +8,8 @@
namespace DB
{
class ASTIdentifier;
/** AST for function application or operator.
*/
class ASTFunction : public ASTWithAlias
@ -18,6 +20,11 @@ public:
/// parameters - for parametric aggregate function. Example: quantile(0.9)(x) - what in first parens are 'parameters'.
ASTPtr parameters;
bool is_window_function = false;
ASTIdentifier * window_name = nullptr;
ASTExpressionList * window_partition_by = nullptr;
ASTExpressionList * window_order_by = nullptr;
/// do not print empty parentheses if there are no args - compatibility with new AST for data types and engine names.
bool no_empty_args = false;
@ -32,6 +39,11 @@ public:
ASTPtr toLiteral() const; // Try to convert functions like Array or Tuple to a literal form.
void appendWindowDescription(const FormatSettings & settings,
FormatState & state, FormatStateStacked frame) const;
std::string getWindowDescription() const;
protected:
void formatImplWithoutAlias(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override;
void appendColumnNameImpl(WriteBuffer & ostr) const override;

View File

@ -1,6 +1,8 @@
#include <Parsers/ASTIndexDeclaration.h>
#include <Common/quoteString.h>
#include <IO/Operators.h>
#include <Parsers/ASTFunction.h>
namespace DB

View File

@ -1,12 +1,12 @@
#pragma once
#include <Parsers/ASTFunction.h>
#include <Parsers/IAST.h>
namespace DB
{
class ASTFunction;
/** name BY expr TYPE typename(args) GRANULARITY int in create query
*/
class ASTIndexDeclaration : public IAST

View File

@ -263,6 +263,7 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
ParserKeyword distinct("DISTINCT");
ParserExpressionList contents(false);
ParserSelectWithUnionQuery select;
ParserKeyword over("OVER");
bool has_distinct_modifier = false;
@ -382,10 +383,96 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
function_node->children.push_back(function_node->parameters);
}
if (over.ignore(pos, expected))
{
function_node->is_window_function = true;
// We are slightly breaking the parser interface by parsing the window
// definition into an existing ASTFunction. Normally it would take a
// reference to ASTPtr and assign it the new node. We only have a pointer
// of a different type, hence this workaround with a temporary pointer.
ASTPtr function_node_as_iast = function_node;
ParserWindowDefinition window_definition;
if (!window_definition.parse(pos, function_node_as_iast, expected))
{
return false;
}
}
node = function_node;
return true;
}
bool ParserWindowDefinition::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
{
ASTFunction * function = dynamic_cast<ASTFunction *>(node.get());
// Variant 1:
// function_name ( * ) OVER window_name
// FIXME doesn't work anyway for now -- never used anywhere, window names
// can't be defined, and TreeRewriter thinks the window name is a column so
// the query fails.
if (pos->type != TokenType::OpeningRoundBracket)
{
ASTPtr window_name_ast;
ParserIdentifier window_name_parser;
if (window_name_parser.parse(pos, window_name_ast, expected))
{
function->set(function->window_name, window_name_ast);
return true;
}
else
{
return false;
}
}
++pos;
// Variant 2:
// function_name ( * ) OVER ( window_definition )
ParserKeyword keyword_partition_by("PARTITION BY");
ParserNotEmptyExpressionList columns_partition_by(
false /* we don't allow declaring aliases here */);
ParserKeyword keyword_order_by("ORDER BY");
ParserOrderByExpressionList columns_order_by;
if (keyword_partition_by.ignore(pos, expected))
{
ASTPtr partition_by_ast;
if (columns_partition_by.parse(pos, partition_by_ast, expected))
{
function->set(function->window_partition_by, partition_by_ast);
}
else
{
return false;
}
}
if (keyword_order_by.ignore(pos, expected))
{
ASTPtr order_by_ast;
if (columns_order_by.parse(pos, order_by_ast, expected))
{
function->set(function->window_order_by, order_by_ast);
}
else
{
return false;
}
}
if (pos->type != TokenType::ClosingRoundBracket)
{
expected.add(pos, "')'");
return false;
}
++pos;
return true;
}
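A hedged sketch of driving this parser directly (constructor signatures assumed from this codebase; the max_depth value is arbitrary):

    String query = "count(*) OVER (PARTITION BY a ORDER BY b)";
    Tokens tokens(query.data(), query.data() + query.size());
    IParser::Pos pos(tokens, /* max_depth */ 1000);
    Expected expected;
    ASTPtr node;
    if (ParserFunction().parse(pos, node, expected))
    {
        const auto & func = node->as<ASTFunction &>();
        assert(func.is_window_function);
        assert(func.window_partition_by && func.window_order_by);
    }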
bool ParserCodecDeclarationList::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
{
return ParserList(std::make_unique<ParserIdentifierWithOptionalParameters>(),

View File

@ -156,6 +156,13 @@ protected:
bool allow_function_parameters;
};
// Window definition (the thing that goes after OVER) for window function.
class ParserWindowDefinition : public IParserBase
{
const char * getName() const override { return "window definition"; }
bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override;
};
class ParserCodecDeclarationList : public IParserBase
{
protected:

View File

@ -161,4 +161,11 @@ void IAST::dumpTree(WriteBuffer & ostr, size_t indent) const
}
}
std::string IAST::dumpTree(size_t indent) const
{
WriteBufferFromOwnString wb;
dumpTree(wb, indent);
return wb.str();
}
}

View File

@ -19,9 +19,6 @@ namespace DB
namespace ErrorCodes
{
extern const int NOT_A_COLUMN;
extern const int UNKNOWN_TYPE_OF_AST_NODE;
extern const int UNKNOWN_ELEMENT_IN_AST;
extern const int LOGICAL_ERROR;
}
@ -46,7 +43,7 @@ public:
String getColumnName() const;
virtual void appendColumnName(WriteBuffer &) const
{
throw Exception("Trying to get name of not a column: " + getID(), ErrorCodes::NOT_A_COLUMN);
throw Exception("Trying to get name of not a column: " + getID(), ErrorCodes::LOGICAL_ERROR);
}
/** Get the alias, if any, or the canonical name of the column, if it is not. */
@ -58,7 +55,7 @@ public:
/** Set the alias. */
virtual void setAlias(const String & /*to*/)
{
throw Exception("Can't set alias of " + getColumnName(), ErrorCodes::UNKNOWN_TYPE_OF_AST_NODE);
throw Exception("Can't set alias of " + getColumnName(), ErrorCodes::LOGICAL_ERROR);
}
/** Get the text that identifies this element. */
@ -77,6 +74,7 @@ public:
virtual void updateTreeHashImpl(SipHash & hash_state) const;
void dumpTree(WriteBuffer & ostr, size_t indent = 0) const;
std::string dumpTree(size_t indent = 0) const;
/** Check the depth of the tree.
* If max_depth is specified and the depth is greater - throw an exception.
@ -160,6 +158,7 @@ public:
bool always_quote_identifiers = false;
IdentifierQuotingStyle identifier_quoting_style = IdentifierQuotingStyle::Backticks;
// Newline or whitespace.
char nl_or_ws;
FormatSettings(WriteBuffer & ostr_, bool one_line_)
@ -208,7 +207,7 @@ public:
virtual void formatImpl(const FormatSettings & /*settings*/, FormatState & /*state*/, FormatStateStacked /*frame*/) const
{
throw Exception("Unknown element in AST: " + getID(), ErrorCodes::UNKNOWN_ELEMENT_IN_AST);
throw Exception("Unknown element in AST: " + getID(), ErrorCodes::LOGICAL_ERROR);
}
// A simple way to add some user-readable context to an error message.

View File

@ -2,6 +2,7 @@
#include <Parsers/ASTColumnDeclaration.h>
#include <Parsers/ASTConstraintDeclaration.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIndexDeclaration.h>
#include <Parsers/New/AST/ColumnExpr.h>
#include <Parsers/New/AST/ColumnTypeExpr.h>

View File

@ -164,7 +164,7 @@ bool PipelineExecutor::expandPipeline(Stack & stack, UInt64 pid)
return true;
}
bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, size_t thread_number)
bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, Queue & async_queue, size_t thread_number)
{
/// In this method we have ownership on edge, but node can be concurrently accessed.
@ -185,7 +185,7 @@ bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & ed
if (status == ExecutingGraph::ExecStatus::Idle)
{
node.status = ExecutingGraph::ExecStatus::Preparing;
return prepareProcessor(edge.to, thread_number, queue, std::move(lock));
return prepareProcessor(edge.to, thread_number, queue, async_queue, std::move(lock));
}
else
graph->nodes[edge.to]->processor->onUpdatePorts();
@ -193,7 +193,7 @@ bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & ed
return true;
}
bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, std::unique_lock<std::mutex> node_lock)
bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, Queue & async_queue, std::unique_lock<std::mutex> node_lock)
{
/// In this method we have ownership on node.
auto & node = *graph->nodes[pid];
@ -248,15 +248,9 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue
}
case IProcessor::Status::Async:
{
throw Exception("Async is temporary not supported.", ErrorCodes::LOGICAL_ERROR);
// node.status = ExecStatus::Executing;
// addAsyncJob(pid);
// break;
}
case IProcessor::Status::Wait:
{
throw Exception("Wait is temporary not supported.", ErrorCodes::LOGICAL_ERROR);
node.status = ExecutingGraph::ExecStatus::Executing;
async_queue.push(&node);
break;
}
case IProcessor::Status::ExpandPipeline:
{
@ -288,13 +282,13 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue
{
for (auto & edge : updated_direct_edges)
{
if (!tryAddProcessorToStackIfUpdated(*edge, queue, thread_number))
if (!tryAddProcessorToStackIfUpdated(*edge, queue, async_queue, thread_number))
return false;
}
for (auto & edge : updated_back_edges)
{
if (!tryAddProcessorToStackIfUpdated(*edge, queue, thread_number))
if (!tryAddProcessorToStackIfUpdated(*edge, queue, async_queue, thread_number))
return false;
}
}
@ -325,7 +319,7 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue
while (!stack.empty())
{
auto item = stack.top();
if (!prepareProcessor(item, thread_number, queue, std::unique_lock<std::mutex>(graph->nodes[item]->status_mutex)))
if (!prepareProcessor(item, thread_number, queue, async_queue, std::unique_lock<std::mutex>(graph->nodes[item]->status_mutex)))
return false;
stack.pop();
@ -378,6 +372,7 @@ void PipelineExecutor::finish()
{
std::lock_guard lock(task_queue_mutex);
finished = true;
async_task_queue.finish();
}
std::lock_guard guard(executor_contexts_mutex);
@ -502,11 +497,21 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st
{
std::unique_lock lock(task_queue_mutex);
if (!task_queue.empty())
if (!context->async_tasks.empty())
{
node = context->async_tasks.front();
context->async_tasks.pop();
--num_waiting_async_tasks;
if (context->async_tasks.empty())
context->has_async_tasks = false;
}
else if (!task_queue.empty())
node = task_queue.pop(thread_num);
if (!task_queue.empty() && !threads_queue.empty() /*&& task_queue.quota() > threads_queue.size()*/)
if (node)
{
if (!task_queue.empty() && !threads_queue.empty())
{
auto thread_to_wake = task_queue.getAnyThreadWithTasks(thread_num + 1 == num_threads ? 0 : (thread_num + 1));
@ -522,13 +527,26 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st
break;
}
if (threads_queue.size() + 1 == num_threads)
if (threads_queue.size() + 1 == num_threads && async_task_queue.empty() && num_waiting_async_tasks == 0)
{
lock.unlock();
finish();
break;
}
#if defined(OS_LINUX)
if (num_threads == 1)
{
/// If we execute in a single thread, wait for async tasks here.
auto res = async_task_queue.wait(lock);
if (!res)
throw Exception("Empty task was returned from async task queue", ErrorCodes::LOGICAL_ERROR);
node = static_cast<ExecutingGraph::Node *>(res.data);
break;
}
#endif
threads_queue.push(thread_num);
}
@ -579,6 +597,7 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st
/// Try to execute neighbour processor.
{
Queue queue;
Queue async_queue;
++num_processing_executors;
while (auto * task = expand_pipeline_task.load())
@ -587,31 +606,39 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st
/// Prepare processor after execution.
{
auto lock = std::unique_lock<std::mutex>(node->status_mutex);
if (!prepareProcessor(node->processors_id, thread_num, queue, std::move(lock)))
if (!prepareProcessor(node->processors_id, thread_num, queue, async_queue, std::move(lock)))
finish();
}
node = nullptr;
/// Take local task from queue if has one.
if (!queue.empty())
if (!queue.empty() && !context->has_async_tasks)
{
node = queue.front();
queue.pop();
}
/// Push other tasks to global queue.
if (!queue.empty())
if (!queue.empty() || !async_queue.empty())
{
std::unique_lock lock(task_queue_mutex);
#if defined(OS_LINUX)
while (!async_queue.empty() && !finished)
{
async_task_queue.addTask(thread_num, async_queue.front(), async_queue.front()->processor->schedule());
async_queue.pop();
}
#endif
while (!queue.empty() && !finished)
{
task_queue.push(queue.front(), thread_num);
queue.pop();
}
if (!threads_queue.empty() && !finished /* && task_queue.quota() > threads_queue.size()*/)
if (!threads_queue.empty() && !task_queue.empty() && !finished)
{
auto thread_to_wake = task_queue.getAnyThreadWithTasks(thread_num + 1 == num_threads ? 0 : (thread_num + 1));
@ -669,6 +696,7 @@ void PipelineExecutor::initializeExecution(size_t num_threads)
std::lock_guard lock(task_queue_mutex);
Queue queue;
Queue async_queue;
size_t next_thread = 0;
while (!stack.empty())
@ -676,7 +704,7 @@ void PipelineExecutor::initializeExecution(size_t num_threads)
UInt64 proc = stack.top();
stack.pop();
prepareProcessor(proc, 0, queue, std::unique_lock<std::mutex>(graph->nodes[proc]->status_mutex));
prepareProcessor(proc, 0, queue, async_queue, std::unique_lock<std::mutex>(graph->nodes[proc]->status_mutex));
while (!queue.empty())
{
@ -687,6 +715,10 @@ void PipelineExecutor::initializeExecution(size_t num_threads)
if (next_thread >= num_threads)
next_thread = 0;
}
while (!async_queue.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Async is only possible after work() call. Processor {}",
async_queue.front()->processor->getName());
}
}
}
@ -747,6 +779,26 @@ void PipelineExecutor::executeImpl(size_t num_threads)
});
}
#if defined(OS_LINUX)
{
/// Wait for async tasks.
std::unique_lock lock(task_queue_mutex);
while (auto task = async_task_queue.wait(lock))
{
auto * node = static_cast<ExecutingGraph::Node *>(task.data);
executor_contexts[task.thread_num]->async_tasks.push(node);
executor_contexts[task.thread_num]->has_async_tasks = true;
++num_waiting_async_tasks;
if (threads_queue.has(task.thread_num))
{
threads_queue.pop(task.thread_num);
wakeUpExecutor(task.thread_num);
}
}
}
#endif
for (auto & thread : threads)
if (thread.joinable())
thread.join();
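A condensed sketch of the async round-trip introduced here, with both sides shown together (locking and edge cases elided):

    // Worker thread, after prepare() returned Async for a node:
    //     async_queue.push(node);
    //     async_task_queue.addTask(thread_num, node, node->processor->schedule());
    //
    // Waiting side -- the main thread, or the only worker when num_threads == 1:
    //     while (auto task = async_task_queue.wait(lock)) // epoll_wait inside
    //     {
    //         auto * node = static_cast<ExecutingGraph::Node *>(task.data);
    //         executor_contexts[task.thread_num]->async_tasks.push(node);
    //         wakeUpExecutor(task.thread_num); // fd is readable, work() may run
    //     }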

View File

@ -1,6 +1,7 @@
#pragma once
#include <Processors/IProcessor.h>
#include <Processors/Executors/PollingQueue.h>
#include <Processors/Executors/ThreadsQueue.h>
#include <Processors/Executors/TasksQueue.h>
#include <Processors/Executors/ExecutingGraph.h>
@ -57,6 +58,12 @@ private:
/// Stores processors need to be prepared. Preparing status is already set for them.
TaskQueue<ExecutingGraph::Node> task_queue;
/// Queue which stores tasks where processors returned Async status after prepare.
/// If multiple threads are used, the main thread will wait for async tasks.
/// For a single thread, it will wait for async tasks only when task_queue is empty.
PollingQueue async_task_queue;
size_t num_waiting_async_tasks = 0;
ThreadsQueue threads_queue;
std::mutex task_queue_mutex;
@ -90,6 +97,9 @@ private:
/// This can be solved by using atomic shard ptr.
std::list<ExpandPipelineTask> task_list;
std::queue<ExecutingGraph::Node *> async_tasks;
std::atomic_bool has_async_tasks = false;
std::condition_variable condvar;
std::mutex mutex;
bool wake_flag = false;
@ -126,14 +136,14 @@ private:
/// Pipeline execution related methods.
void addChildlessProcessorsToStack(Stack & stack);
bool tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, size_t thread_number);
bool tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, Queue & async_queue, size_t thread_number);
static void addJob(ExecutingGraph::Node * execution_state);
// TODO: void addAsyncJob(UInt64 pid);
/// Prepare processor with pid number.
/// Check parents and children of current processor and push them to stacks if they also need to be prepared.
/// If processor wants to be expanded, ExpandPipelineTask from thread_number's execution context will be used.
bool prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, std::unique_lock<std::mutex> node_lock);
bool prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, Queue & async_queue, std::unique_lock<std::mutex> node_lock);
bool doExpandPipeline(ExpandPipelineTask * task, bool processing);
/// Continue executor (in case there are tasks in queue).

View File

@ -0,0 +1,115 @@
#include <Processors/Executors/PollingQueue.h>
#if defined(OS_LINUX)
#include <Common/Exception.h>
#include <sys/epoll.h>
#include <unistd.h>
#include <fcntl.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_OPEN_FILE;
extern const int CANNOT_READ_FROM_SOCKET;
extern const int LOGICAL_ERROR;
}
PollingQueue::PollingQueue()
{
epoll_fd = epoll_create(1);
if (-1 == epoll_fd)
throwFromErrno("Cannot create epoll descriptor", ErrorCodes::CANNOT_OPEN_FILE);
if (-1 == pipe2(pipe_fd, O_NONBLOCK))
throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_OPEN_FILE);
epoll_event socket_event;
socket_event.events = EPOLLIN | EPOLLPRI;
socket_event.data.ptr = pipe_fd;
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &socket_event))
throwFromErrno("Cannot add pipe descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
}
PollingQueue::~PollingQueue()
{
close(epoll_fd);
close(pipe_fd[0]);
close(pipe_fd[1]);
}
void PollingQueue::addTask(size_t thread_number, void * data, int fd)
{
std::uintptr_t key = reinterpret_cast<uintptr_t>(data);
if (tasks.count(key))
throw Exception("Task was already added to task queue", ErrorCodes::LOGICAL_ERROR);
tasks[key] = TaskData{thread_number, data, fd};
epoll_event socket_event;
socket_event.events = EPOLLIN | EPOLLPRI;
socket_event.data.ptr = data;
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &socket_event))
throwFromErrno("Cannot add socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
}
PollingQueue::TaskData PollingQueue::wait(std::unique_lock<std::mutex> & lock)
{
if (is_finished)
return {};
lock.unlock();
epoll_event event;
event.data.ptr = nullptr;
int num_events = 0;
while (num_events == 0)
{
num_events = epoll_wait(epoll_fd, &event, 1, 0);
if (num_events == -1)
throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET);
}
lock.lock();
if (event.data.ptr == pipe_fd)
return {};
std::uintptr_t key = reinterpret_cast<uintptr_t>(event.data.ptr);
auto it = tasks.find(key);
if (it == tasks.end())
throw Exception("Task was not found in task queue", ErrorCodes::LOGICAL_ERROR);
auto res = it->second;
tasks.erase(it);
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_DEL, res.fd, &event))
throwFromErrno("Cannot remove socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
return res;
}
void PollingQueue::finish()
{
is_finished = true;
tasks.clear();
uint64_t buf = 0;
while (-1 == write(pipe_fd[1], &buf, sizeof(buf)))
{
if (errno == EAGAIN)
break;
if (errno != EINTR)
throwFromErrno("Cannot write to pipe", ErrorCodes::CANNOT_READ_FROM_SOCKET);
}
}
}
#endif

View File

@ -0,0 +1,60 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <atomic>
#include <unordered_map>
namespace DB
{
#if defined(OS_LINUX)
/// This queue is used to poll descriptors. Generally, just a wrapper over epoll.
class PollingQueue
{
public:
struct TaskData
{
size_t thread_num;
void * data = nullptr;
int fd = -1;
explicit operator bool() const { return data; }
};
private:
int epoll_fd;
int pipe_fd[2];
std::atomic_bool is_finished = false;
std::unordered_map<std::uintptr_t, TaskData> tasks;
public:
PollingQueue();
~PollingQueue();
size_t size() const { return tasks.size(); }
bool empty() const { return tasks.empty(); }
/// Add new task to queue.
void addTask(size_t thread_number, void * data, int fd);
/// Wait for any descriptor. If there are no descriptors in the queue, blocks.
/// Returns the task that was inserted into the queue, or an empty TaskData if finish() was called.
/// The lock is unlocked while waiting.
TaskData wait(std::unique_lock<std::mutex> & lock);
/// Interrupt waiting.
void finish();
};
#else
class PollingQueue
{
public:
bool empty() { return true; }
void finish() {}
};
#endif
}
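A minimal usage sketch of this queue; node_ptr, socket_fd and process() are hypothetical, and error handling is omitted:

    PollingQueue queue;
    std::mutex mutex;

    // Producer: register a task keyed by an opaque pointer and its descriptor.
    queue.addTask(/* thread_number */ 0, /* data */ node_ptr, /* fd */ socket_fd);

    // Consumer: block in epoll until some fd is ready or finish() is called.
    std::unique_lock lock(mutex);
    while (auto task = queue.wait(lock)) // the lock is released while waiting
        process(task.thread_num, task.data);

    // Elsewhere: queue.finish() writes to the self-pipe and wakes the consumer.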

View File

@ -56,7 +56,6 @@ protected:
case IProcessor::Status::NeedData:
case IProcessor::Status::Async:
case IProcessor::Status::Wait:
case IProcessor::Status::ExpandPipeline:
throw Exception("Source processor returned status " + IProcessor::statusToName(status), ErrorCodes::LOGICAL_ERROR);
}

View File

@ -33,8 +33,6 @@ std::string IProcessor::statusToName(Status status)
return "Ready";
case Status::Async:
return "Async";
case Status::Wait:
return "Wait";
case Status::ExpandPipeline:
return "ExpandPipeline";
}

View File

@ -146,13 +146,10 @@ public:
/// You may call 'work' method and processor will do some work synchronously.
Ready,
/// You may call 'schedule' method and processor will initiate some background work.
/// You may call 'schedule' method and the processor will return a descriptor.
/// You need to poll this descriptor and call work() afterwards.
Async,
/// Processor is doing some work in background.
/// You may wait for next event or do something else and then you should call 'prepare' again.
Wait,
/// Processor wants to add other processors to pipeline.
/// New processors must be obtained by expandPipeline() call.
ExpandPipeline,
@ -198,16 +195,21 @@ public:
throw Exception("Method 'work' is not implemented for " + getName() + " processor", ErrorCodes::NOT_IMPLEMENTED);
}
/** You may call this method if 'prepare' returned Async.
/** Executor must call this method when 'prepare' returned Async.
* This method cannot access any ports. It should use only data that was prepared by 'prepare' method.
*
* This method should return instantly and fire an event (or many events) when asynchronous job will be done.
* When the job is not done, method 'prepare' will return Wait and the user may block and wait for next event before checking again.
* This method should instantly return an epollable file descriptor which will become readable when the asynchronous job is done.
* When the descriptor becomes readable, method `work` is called to continue data processing.
*
* Note that it can fire many events in EventCounter while doing its job,
* and you have to wait for next event (or do something else) every time when 'prepare' returned Wait.
* NOTE: it would be more logical to let `work()` return the Async status instead of prepare. That would give the
* prepare() -> work() -> schedule() -> work() -> schedule() -> .. -> work() -> prepare()
* chain instead of
* prepare() -> work() -> prepare() -> schedule() -> work() -> prepare() -> schedule() -> .. -> work() -> prepare()
*
* It is expected that the executor epolls using level-triggered notifications.
* Read all available data from the descriptor before returning Async.
*/
virtual void schedule(EventCounter & /*watch*/)
virtual int schedule()
{
throw Exception("Method 'schedule' is not implemented for " + getName() + " processor", ErrorCodes::NOT_IMPLEMENTED);
}
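A sketch of a processor following the new contract, as a hypothetical socket-backed source (ports, construction and data handling elided):

    class SocketSource : public IProcessor // hypothetical example
    {
        int fd = -1; // non-blocking socket
    public:
        String getName() const override { return "SocketSource"; }
        Status prepare() override
        {
            // ... if the output can accept data but the socket has none buffered:
            return Status::Async; // the executor will call schedule()
        }
        int schedule() override { return fd; }
        void work() override
        {
            // Called when fd is readable. Drain it completely: the executor
            // uses level-triggered epoll, so leftover data would wake it again.
        }
    };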

View File

@ -4,6 +4,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
ISource::ISource(Block header)
: IProcessor({}, {std::move(header)}), output(outputs.front())
{
@ -45,12 +50,18 @@ void ISource::work()
{
try
{
current_chunk.chunk = generate();
if (!current_chunk.chunk || isCancelled())
finished = true;
else
if (auto chunk = tryGenerate())
{
current_chunk.chunk = std::move(*chunk);
if (current_chunk.chunk)
has_input = true;
}
else
finished = true;
if (isCancelled())
finished = true;
}
catch (...)
{
finished = true;
@ -58,5 +69,19 @@ void ISource::work()
}
}
Chunk ISource::generate()
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "generate is not implemented for {}", getName());
}
std::optional<Chunk> ISource::tryGenerate()
{
auto chunk = generate();
if (!chunk)
return {};
return chunk;
}
}

View File

@ -15,7 +15,8 @@ protected:
bool got_exception = false;
Port::Data current_chunk;
virtual Chunk generate() = 0;
virtual Chunk generate();
virtual std::optional<Chunk> tryGenerate();
public:
ISource(Block header);
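The point of the new tryGenerate: an engaged optional holding an empty Chunk now means "no data right now, keep running", which generate() could not express (an empty chunk from it meant end-of-stream). A sketch, where poll_once() is a hypothetical non-blocking read:

    class PollingSource : public ISource
    {
    public:
        using ISource::ISource;
        String getName() const override { return "PollingSource"; }

    protected:
        std::optional<Chunk> tryGenerate() override
        {
            if (Block block = poll_once())
                return Chunk(block.getColumns(), block.rows());
            // Engaged optional with an empty chunk: not finished, poll again later.
            // Returning std::nullopt instead would mark the source as finished.
            return Chunk();
        }
    };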

View File

@ -0,0 +1,111 @@
#include <Processors/QueryPlan/WindowStep.h>
#include <Processors/Transforms/WindowTransform.h>
#include <Processors/Transforms/ExpressionTransform.h>
#include <Processors/QueryPipeline.h>
#include <Interpreters/ExpressionActions.h>
#include <IO/Operators.h>
namespace DB
{
static ITransformingStep::Traits getTraits()
{
return ITransformingStep::Traits
{
{
.preserves_distinct_columns = true,
.returns_single_stream = false,
.preserves_number_of_streams = true,
.preserves_sorting = true,
},
{
.preserves_number_of_rows = true
}
};
}
static Block addWindowFunctionResultColumns(const Block & block,
std::vector<WindowFunctionDescription> window_functions)
{
auto result = block;
for (const auto & f : window_functions)
{
ColumnWithTypeAndName column_with_type;
column_with_type.name = f.column_name;
column_with_type.type = f.aggregate_function->getReturnType();
column_with_type.column = column_with_type.type->createColumn();
result.insert(column_with_type);
}
return result;
}
WindowStep::WindowStep(const DataStream & input_stream_,
const WindowDescription & window_description_,
const std::vector<WindowFunctionDescription> & window_functions_)
: ITransformingStep(
input_stream_,
addWindowFunctionResultColumns(input_stream_.header,
window_functions_),
getTraits())
, window_description(window_description_)
, window_functions(window_functions_)
, input_header(input_stream_.header)
{
// We don't remove any columns, we only add them, so we probably don't have
// to update the output DataStream::distinct_columns.
}
void WindowStep::transformPipeline(QueryPipeline & pipeline)
{
pipeline.addSimpleTransform([&](const Block & /*header*/)
{
return std::make_shared<WindowTransform>(input_header,
output_stream->header, window_description, window_functions);
});
assertBlocksHaveEqualStructure(pipeline.getHeader(), output_stream->header,
"WindowStep transform for '" + window_description.window_name + "'");
}
void WindowStep::describeActions(FormatSettings & settings) const
{
String prefix(settings.offset, ' ');
settings.out << prefix << "Window: (";
if (!window_description.partition_by.empty())
{
settings.out << "PARTITION BY ";
for (size_t i = 0; i < window_description.partition_by.size(); ++i)
{
if (i > 0)
{
settings.out << ", ";
}
settings.out << window_description.partition_by[i].column_name;
}
}
if (!window_description.partition_by.empty()
&& !window_description.order_by.empty())
{
settings.out << " ";
}
if (!window_description.order_by.empty())
{
settings.out << "ORDER BY "
<< dumpSortDescription(window_description.order_by);
}
settings.out << ")\n";
for (size_t i = 0; i < window_functions.size(); ++i)
{
settings.out << prefix << (i == 0 ? "Functions: "
: " ");
settings.out << window_functions[i].column_name << "\n";
}
}
}
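For reference, with hypothetical window and column names (and the ORDER BY part rendered by dumpSortDescription), describeActions() produces output along these lines:

Window: (PARTITION BY department ORDER BY salary ASC)
Functions: sum(salary) OVER w
           avg(salary) OVER w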

View File

@ -0,0 +1,33 @@
#pragma once
#include <Processors/QueryPlan/ITransformingStep.h>
#include <Interpreters/AggregateDescription.h>
namespace DB
{
class ActionsDAG;
using ActionsDAGPtr = std::shared_ptr<ActionsDAG>;
class WindowTransform;
class WindowStep : public ITransformingStep
{
public:
explicit WindowStep(const DataStream & input_stream_,
const WindowDescription & window_description_,
const std::vector<WindowFunctionDescription> & window_functions_);
String getName() const override { return "Window"; }
void transformPipeline(QueryPipeline & pipeline) override;
void describeActions(FormatSettings & settings) const override;
private:
WindowDescription window_description;
std::vector<WindowFunctionDescription> window_functions;
Block input_header;
};
}

View File

@ -1,14 +1,16 @@
#include <Processors/Sources/RemoteSource.h>
#include <DataStreams/RemoteQueryExecutor.h>
#include <DataStreams/RemoteQueryExecutorReadContext.h>
#include <Processors/Transforms/AggregatingTransform.h>
#include <DataTypes/DataTypeAggregateFunction.h>
namespace DB
{
RemoteSource::RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation_info_)
RemoteSource::RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation_info_, bool async_read_)
: SourceWithProgress(executor->getHeader(), false)
, add_aggregation_info(add_aggregation_info_), query_executor(std::move(executor))
, async_read(async_read_)
{
/// Add AggregatedChunkInfo if we expect DataTypeAggregateFunction as a result.
const auto & sample = getPort().getHeader();
@ -21,15 +23,28 @@ RemoteSource::~RemoteSource() = default;
ISource::Status RemoteSource::prepare()
{
/// Check if the query was cancelled before returning Async status. Otherwise it may lead to an infinite loop.
if (was_query_canceled)
{
getPort().finish();
return Status::Finished;
}
if (is_async_state)
return Status::Async;
Status status = SourceWithProgress::prepare();
/// To avoid resetting the connection (because of an "unfinished" query) in the
/// RemoteQueryExecutor, it should be finished explicitly.
if (status == Status::Finished)
query_executor->finish();
{
query_executor->finish(&read_context);
is_async_state = false;
}
return status;
}
Chunk RemoteSource::generate()
std::optional<Chunk> RemoteSource::tryGenerate()
{
/// onCancel() will perform the cancellation if the query was already sent.
if (was_query_canceled)
@ -52,11 +67,28 @@ Chunk RemoteSource::generate()
was_query_sent = true;
}
auto block = query_executor->read();
Block block;
if (async_read)
{
auto res = query_executor->read(read_context);
if (std::holds_alternative<int>(res))
{
fd = std::get<int>(res);
is_async_state = true;
return Chunk();
}
is_async_state = false;
block = std::get<Block>(std::move(res));
}
else
block = query_executor->read();
if (!block)
{
query_executor->finish();
query_executor->finish(&read_context);
return {};
}
@ -77,7 +109,18 @@ Chunk RemoteSource::generate()
void RemoteSource::onCancel()
{
was_query_canceled = true;
query_executor->cancel();
query_executor->cancel(&read_context);
// is_async_state = false;
}
void RemoteSource::onUpdatePorts()
{
if (getPort().isFinished())
{
was_query_canceled = true;
query_executor->finish(&read_context);
// is_async_state = false;
}
}
@ -123,9 +166,9 @@ Chunk RemoteExtremesSource::generate()
Pipe createRemoteSourcePipe(
RemoteQueryExecutorPtr query_executor,
bool add_aggregation_info, bool add_totals, bool add_extremes)
bool add_aggregation_info, bool add_totals, bool add_extremes, bool async_read)
{
Pipe pipe(std::make_shared<RemoteSource>(query_executor, add_aggregation_info));
Pipe pipe(std::make_shared<RemoteSource>(query_executor, add_aggregation_info, async_read));
if (add_totals)
pipe.addTotalsSource(std::make_shared<RemoteTotalsSource>(query_executor));

View File

@ -11,6 +11,8 @@ namespace DB
class RemoteQueryExecutor;
using RemoteQueryExecutorPtr = std::shared_ptr<RemoteQueryExecutor>;
class RemoteQueryExecutorReadContext;
/// Source from RemoteQueryExecutor. Executes a remote query and returns the query result in chunks.
class RemoteSource : public SourceWithProgress
{
@ -18,7 +20,7 @@ public:
/// Flag add_aggregation_info tells if AggregatedChunkInfo should be added to result chunk.
/// AggregatedChunkInfo stores the bucket number used for two-level aggregation.
/// This flag should be typically enabled for queries with GROUP BY which are executed till WithMergeableState.
RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation_info_);
RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation_info_, bool async_read_);
~RemoteSource() override;
Status prepare() override;
@ -27,14 +29,12 @@ public:
void setRowsBeforeLimitCounter(RowsBeforeLimitCounterPtr counter) { rows_before_limit.swap(counter); }
/// Stop reading from stream if output port is finished.
void onUpdatePorts() override
{
if (getPort().isFinished())
cancel();
}
void onUpdatePorts() override;
int schedule() override { return fd; }
protected:
Chunk generate() override;
std::optional<Chunk> tryGenerate() override;
void onCancel() override;
private:
@ -43,6 +43,11 @@ private:
bool add_aggregation_info = false;
RemoteQueryExecutorPtr query_executor;
RowsBeforeLimitCounterPtr rows_before_limit;
const bool async_read;
bool is_async_state = false;
std::unique_ptr<RemoteQueryExecutorReadContext> read_context;
int fd = -1;
};
/// Totals source from RemoteQueryExecutor.
@ -80,6 +85,6 @@ private:
/// Create pipe with remote sources.
Pipe createRemoteSourcePipe(
RemoteQueryExecutorPtr query_executor,
bool add_aggregation_info, bool add_totals, bool add_extremes);
bool add_aggregation_info, bool add_totals, bool add_extremes, bool async_read);
}
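A hypothetical call site for the extended signature (construction of query_executor omitted), showing the new async_read flag when assembling a remote pipe:

Pipe pipe = createRemoteSourcePipe(
    query_executor,
    /* add_aggregation_info = */ false,
    /* add_totals = */ true,
    /* add_extremes = */ true,
    /* async_read = */ true);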

View File

@ -0,0 +1,184 @@
#include <Processors/Transforms/WindowTransform.h>
#include <Interpreters/ExpressionActions.h>
#include <Common/Arena.h>
namespace DB
{
WindowTransform::WindowTransform(const Block & input_header_,
const Block & output_header_,
const WindowDescription & window_description_,
const std::vector<WindowFunctionDescription> & window_function_descriptions
)
: ISimpleTransform(input_header_, output_header_,
false /* skip_empty_chunks */)
, input_header(input_header_)
, window_description(window_description_)
{
workspaces.reserve(window_function_descriptions.size());
for (const auto & f : window_function_descriptions)
{
WindowFunctionWorkspace workspace;
workspace.window_function = f;
const auto & aggregate_function
= workspace.window_function.aggregate_function;
if (!arena && aggregate_function->allocatesMemoryInArena())
{
arena = std::make_unique<Arena>();
}
workspace.argument_column_indices.reserve(
workspace.window_function.argument_names.size());
workspace.argument_columns.reserve(
workspace.window_function.argument_names.size());
for (const auto & argument_name : workspace.window_function.argument_names)
{
workspace.argument_column_indices.push_back(
input_header.getPositionByName(argument_name));
}
workspace.aggregate_function_state.reset(aggregate_function->sizeOfData(),
aggregate_function->alignOfData());
aggregate_function->create(workspace.aggregate_function_state.data());
workspaces.push_back(std::move(workspace));
}
partition_by_indices.reserve(window_description.partition_by.size());
for (const auto & column : window_description.partition_by)
{
partition_by_indices.push_back(
input_header.getPositionByName(column.column_name));
}
partition_start_columns.resize(partition_by_indices.size(), nullptr);
partition_start_row = 0;
}
WindowTransform::~WindowTransform()
{
// Some states may not have been created yet if the creation failed.
for (auto & ws : workspaces)
{
ws.window_function.aggregate_function->destroy(
ws.aggregate_function_state.data());
}
}
void WindowTransform::transform(Chunk & chunk)
{
const size_t num_rows = chunk.getNumRows();
auto columns = chunk.detachColumns();
for (auto & ws : workspaces)
{
ws.argument_columns.clear();
for (const auto column_index : ws.argument_column_indices)
{
ws.argument_columns.push_back(columns[column_index].get());
}
ws.result_column = ws.window_function.aggregate_function->getReturnType()
->createColumn();
}
// We loop over all window functions for each row. Switching the loops might
// be more efficient, because we would run less code and access less data in
// the inner loop. If you change this, don't forget to fix the calculation of
// partition boundaries. It probably has to be precalculated and stored as
// an array of offsets. An interesting optimization would be to pass it as
// an extra column from the previous sorting step -- that step might need to
// make a similar comparison anyway, if it's sorting only by the PARTITION BY
// columns.
for (size_t row = 0; row < num_rows; row++)
{
// Check whether the new partition has started. We have to reset the
// aggregate functions when the new partition starts.
assert(partition_start_columns.size() == partition_by_indices.size());
bool new_partition = false;
if (partition_start_columns.empty())
{
// No PARTITION BY at all, do nothing.
}
else if (partition_start_columns[0] == nullptr)
{
// This is the first partition.
new_partition = true;
partition_start_columns.clear();
for (const auto i : partition_by_indices)
{
partition_start_columns.push_back(columns[i]);
}
partition_start_row = row;
}
else
{
// Check whether a new partition has started, by comparing all the
// PARTITION BY columns.
size_t first_inequal_column = 0;
for (; first_inequal_column < partition_start_columns.size();
++first_inequal_column)
{
const auto * current_column = columns[
partition_by_indices[first_inequal_column]].get();
if (current_column->compareAt(row, partition_start_row,
*partition_start_columns[first_inequal_column],
1 /* nan_direction_hint */) != 0)
{
break;
}
}
if (first_inequal_column < partition_start_columns.size())
{
// The new partition has started. Remember where.
new_partition = true;
partition_start_columns.clear();
for (const auto i : partition_by_indices)
{
partition_start_columns.push_back(columns[i]);
}
partition_start_row = row;
}
}
for (auto & ws : workspaces)
{
const auto & f = ws.window_function;
const auto * a = f.aggregate_function.get();
auto * buf = ws.aggregate_function_state.data();
if (new_partition)
{
// Reset the aggregate function states.
a->destroy(buf);
a->create(buf);
}
// Update the aggregate function state and save the result.
a->add(buf,
ws.argument_columns.data(),
row,
arena.get());
a->insertResultInto(buf,
*ws.result_column,
arena.get());
}
}
// We have to release the mutable reference to the result column before we
// return this block, or else extra copying may occur when the subsequent
// processors modify the block. Workspaces live longer than individual blocks.
for (auto & ws : workspaces)
{
columns.push_back(std::move(ws.result_column));
}
chunk.setColumns(std::move(columns), num_rows);
}
}
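Because the transform adds each row to the aggregate state and immediately inserts the result, every output row carries the aggregate over the partition prefix up to and including that row. A worked example with made-up data, computing sum(x) over a window partitioned by p:

p   x    sum(x)
1   10   10
1   20   30
2    5    5
2    7   12

The state is destroyed and recreated when p changes, so the running total restarts at each partition boundary.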

View File

@ -0,0 +1,77 @@
#pragma once
#include <Processors/ISimpleTransform.h>
#include <Interpreters/AggregateDescription.h>
#include <Common/AlignedBuffer.h>
namespace DB
{
class ExpressionActions;
using ExpressionActionsPtr = std::shared_ptr<ExpressionActions>;
class Arena;
// Runtime data for computing one window function
struct WindowFunctionWorkspace
{
WindowFunctionDescription window_function;
AlignedBuffer aggregate_function_state;
std::vector<size_t> argument_column_indices;
// Argument and result columns. Be careful, they are per-chunk.
std::vector<const IColumn *> argument_columns;
MutableColumnPtr result_column;
};
/*
* Computes several window functions that share the same window. The input must
* be sorted correctly for this window (PARTITION BY, then ORDER BY).
*/
class WindowTransform : public ISimpleTransform
{
public:
WindowTransform(
const Block & input_header_,
const Block & output_header_,
const WindowDescription & window_description_,
const std::vector<WindowFunctionDescription> &
window_function_descriptions);
~WindowTransform() override;
String getName() const override
{
return "WindowTransform";
}
static Block transformHeader(Block header, const ExpressionActionsPtr & expression);
void transform(Chunk & chunk) override;
public:
Block input_header;
WindowDescription window_description;
// Indices of the PARTITION BY columns in block.
std::vector<size_t> partition_by_indices;
// The columns for PARTITION BY and the row in these columns where the
// current partition started. They might be in some of the previous blocks,
// so we have to keep the shared ownership of the columns. We don't keep the
// entire block to save memory, only the needed columns, in the same order
// as the partition_by_indices array.
// Can be empty if there is no PARTITION BY.
// Columns are nullptr when it is the first partition.
std::vector<ColumnPtr> partition_start_columns;
size_t partition_start_row = 0;
// Data for computing the window functions.
std::vector<WindowFunctionWorkspace> workspaces;
std::unique_ptr<Arena> arena;
};
}

View File

@ -17,6 +17,7 @@ SRCS(
Executors/ExecutingGraph.cpp
Executors/PipelineExecutingBlockInputStream.cpp
Executors/PipelineExecutor.cpp
Executors/PollingQueue.cpp
Executors/PullingAsyncPipelineExecutor.cpp
Executors/PullingPipelineExecutor.cpp
ForkProcessor.cpp
@ -123,6 +124,7 @@ SRCS(
QueryPlan/SettingQuotaAndLimitsStep.cpp
QueryPlan/TotalsHavingStep.cpp
QueryPlan/UnionStep.cpp
QueryPlan/WindowStep.cpp
ResizeProcessor.cpp
Sources/DelayedSource.cpp
Sources/RemoteSource.cpp
@ -155,6 +157,7 @@ SRCS(
Transforms/RollupTransform.cpp
Transforms/SortingTransform.cpp
Transforms/TotalsHavingTransform.cpp
Transforms/WindowTransform.cpp
printPipeline.cpp
)

View File

@ -3,21 +3,22 @@
#include <sparsehash/dense_hash_map>
#include <sparsehash/dense_hash_set>
#include <Storages/AlterCommands.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/quoteString.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>
#include <Interpreters/Context.h>
#include <Interpreters/ExpressionActions.h>
#include <Interpreters/InterpreterSelectQuery.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ASTSetQuery.h>
#include <Processors/Pipe.h>
#include <Processors/QueryPlan/ReadFromPreparedSource.h>
#include <Interpreters/Context.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/quoteString.h>
#include <Interpreters/ExpressionActions.h>
#include <Interpreters/InterpreterSelectQuery.h>
#include <Storages/AlterCommands.h>
namespace DB
{
namespace ErrorCodes
{
extern const int TABLE_IS_DROPPED;
@ -39,8 +40,9 @@ RWLockImpl::LockHolder IStorage::tryLockTimed(
{
const String type_str = type == RWLockImpl::Type::Read ? "READ" : "WRITE";
throw Exception(
type_str + " locking attempt on \"" + getStorageID().getFullTableName() +
"\" has timed out! (" + std::to_string(acquire_timeout.count()) + "ms) "
type_str + " locking attempt on \"" + getStorageID().getFullTableName() + "\" has timed out! ("
+ std::to_string(acquire_timeout.count())
+ "ms) "
"Possible deadlock avoided. Client should retry.",
ErrorCodes::DEADLOCK_AVOIDED);
}
@ -117,15 +119,12 @@ void IStorage::read(
}
Pipe IStorage::alterPartition(
const StorageMetadataPtr & /* metadata_snapshot */,
const PartitionCommands & /* commands */,
const Context & /* context */)
const StorageMetadataPtr & /* metadata_snapshot */, const PartitionCommands & /* commands */, const Context & /* context */)
{
throw Exception("Partition operations are not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void IStorage::alter(
const AlterCommands & params, const Context & context, TableLockHolder &)
void IStorage::alter(const AlterCommands & params, const Context & context, TableLockHolder &)
{
auto table_id = getStorageID();
StorageInMemoryMetadata new_metadata = getInMemoryMetadata();
@ -146,7 +145,8 @@ void IStorage::checkAlterIsPossible(const AlterCommands & commands, const Settin
}
}
void IStorage::checkAlterPartitionIsPossible(const PartitionCommands & /*commands*/, const StorageMetadataPtr & /*metadata_snapshot*/, const Settings & /*settings*/) const
void IStorage::checkAlterPartitionIsPossible(
const PartitionCommands & /*commands*/, const StorageMetadataPtr & /*metadata_snapshot*/, const Settings & /*settings*/) const
{
throw Exception("Table engine " + getName() + " doesn't support partitioning", ErrorCodes::NOT_IMPLEMENTED);
}
@ -168,4 +168,52 @@ NamesAndTypesList IStorage::getVirtuals() const
return {};
}
Names IStorage::getAllRegisteredNames() const
{
Names result;
auto getter = [](const auto & column) { return column.name; };
const NamesAndTypesList & available_columns = getInMemoryMetadata().getColumns().getAllPhysical();
std::transform(available_columns.begin(), available_columns.end(), std::back_inserter(result), getter);
return result;
}
std::string PrewhereDAGInfo::dump() const
{
WriteBufferFromOwnString ss;
ss << "PrewhereDagInfo\n";
if (alias_actions)
{
ss << "alias_actions " << alias_actions->dumpDAG() << "\n";
}
if (prewhere_actions)
{
ss << "prewhere_actions " << prewhere_actions->dumpDAG() << "\n";
}
if (remove_columns_actions)
{
ss << "remove_columns_actions " << remove_columns_actions->dumpDAG() << "\n";
}
ss << "remove_prewhere_column " << remove_prewhere_column
<< ", need_filter " << need_filter << "\n";
return ss.str();
}
std::string FilterInfo::dump() const
{
WriteBufferFromOwnString ss;
ss << "FilterInfo for column '" << column_name <<"', do_remove_column "
<< do_remove_column << "\n";
if (actions_dag)
{
ss << "actions_dag " << actions_dag->dumpDAG() << "\n";
}
return ss.str();
}
}

View File

@ -78,7 +78,7 @@ struct ColumnSize
* - data storage structure (compression, etc.)
* - concurrent access to data (locks, etc.)
*/
class IStorage : public std::enable_shared_from_this<IStorage>, public TypePromotion<IStorage>
class IStorage : public std::enable_shared_from_this<IStorage>, public TypePromotion<IStorage>, public IHints<1, IStorage>
{
public:
IStorage() = delete;
@ -87,7 +87,6 @@ public:
: storage_id(std::move(storage_id_))
, metadata(std::make_unique<StorageInMemoryMetadata>()) {} //-V730
virtual ~IStorage() = default;
IStorage(const IStorage &) = delete;
IStorage & operator=(const IStorage &) = delete;
@ -172,6 +171,7 @@ public:
/// By default return empty list of columns.
virtual NamesAndTypesList getVirtuals() const;
Names getAllRegisteredNames() const override;
protected:
/// Returns whether the column is virtual - by default all columns are real.

View File

@ -2,6 +2,7 @@
#include <Interpreters/TreeRewriter.h>
#include <Storages/IndicesDescription.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIndexDeclaration.h>
#include <Parsers/formatAST.h>
#include <Parsers/ParserCreateQuery.h>

View File

@ -125,7 +125,6 @@ Block KafkaBlockInputStream::readImpl()
}
case IProcessor::Status::NeedData:
case IProcessor::Status::Async:
case IProcessor::Status::Wait:
case IProcessor::Status::ExpandPipeline:
throw Exception("Source processor returned status " + IProcessor::statusToName(status), ErrorCodes::LOGICAL_ERROR);
}

View File

@ -10,7 +10,6 @@
#include <Storages/SelectQueryInfo.h>
#include <Storages/MergeTree/MarkRange.h>
#include <Interpreters/ExpressionActions.h>
#include <Parsers/ASTIndexDeclaration.h>
#include <DataTypes/DataTypeLowCardinality.h>
constexpr auto INDEX_FILE_PREFIX = "skp_idx_";

View File

@ -111,9 +111,6 @@ struct Settings;
M(Bool, remove_empty_parts, true, "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm", 0) \
M(Bool, assign_part_uuids, false, "Generate UUIDs for parts. Before enabling check that all replicas support new format.", 0) \
\
/** Settings for testing purposes */ \
M(Bool, randomize_part_type, false, "For testing purposes only. Randomizes part type between wide and compact", 0) \
\
/** Obsolete settings. Kept for backward compatibility only. */ \
M(UInt64, min_relative_delay_to_yield_leadership, 120, "Obsolete setting, does nothing.", 0) \
M(UInt64, check_delay_period, 60, "Obsolete setting, does nothing.", 0) \

View File

@ -234,25 +234,6 @@ If you use the Replicated version of engines, see https://clickhouse.tech/docs/e
}
static void randomizePartTypeSettings(const std::unique_ptr<MergeTreeSettings> & storage_settings)
{
static constexpr auto MAX_THRESHOLD_FOR_ROWS = 100000;
static constexpr auto MAX_THRESHOLD_FOR_BYTES = 1024 * 1024 * 10;
/// Create all parts in wide format with probability 1/3.
if (thread_local_rng() % 3 == 0)
{
storage_settings->min_rows_for_wide_part = 0;
storage_settings->min_bytes_for_wide_part = 0;
}
else
{
storage_settings->min_rows_for_wide_part = std::uniform_int_distribution{0, MAX_THRESHOLD_FOR_ROWS}(thread_local_rng);
storage_settings->min_bytes_for_wide_part = std::uniform_int_distribution{0, MAX_THRESHOLD_FOR_BYTES}(thread_local_rng);
}
}
static StoragePtr create(const StorageFactory::Arguments & args)
{
/** [Replicated][|Summing|Collapsing|Aggregating|Replacing|Graphite]MergeTree (2 * 7 combinations) engines
@ -737,20 +718,6 @@ static StoragePtr create(const StorageFactory::Arguments & args)
++arg_num;
}
/// Allow to randomize part type for tests to cover more cases.
/// But if settings were set explicitly restrict it.
if (storage_settings->randomize_part_type
&& !storage_settings->min_rows_for_wide_part.changed
&& !storage_settings->min_bytes_for_wide_part.changed)
{
randomizePartTypeSettings(storage_settings);
LOG_INFO(&Poco::Logger::get(args.table_id.getNameForLogs() + " (registerStorageMergeTree)"),
"Applied setting 'randomize_part_type'. "
"Setting 'min_rows_for_wide_part' changed to {}. "
"Setting 'min_bytes_for_wide_part' changed to {}.",
storage_settings->min_rows_for_wide_part, storage_settings->min_bytes_for_wide_part);
}
if (arg_num != arg_cnt)
throw Exception("Wrong number of engine arguments.", ErrorCodes::BAD_ARGUMENTS);

Some files were not shown because too many files have changed in this diff.