From 7b4e0e5bd853a8f7887785bd96edc330cf94451d Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Wed, 23 Nov 2022 23:37:02 +0100 Subject: [PATCH 1/4] Properly handle EINTR; some more comments --- programs/server/Server.cpp | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 4702985c0ae..6c5adb1cb8e 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1,5 +1,6 @@ #include "Server.h" +#include #include #include #include @@ -652,7 +653,8 @@ static void sanityChecks(Server & server) } #if defined(OS_LINUX) -/// Sends notification to systemd, analogous to sd_notify from libsystemd +/// Sends notification to systemd, analogous to sd_notify from libsystemd. +/// See https://man7.org/linux/man-pages/man3/sd_notify.3.html for more information on the supported notifications. static void systemdNotify(const std::string_view & command) { const char * path = getenv("NOTIFY_SOCKET"); // NOLINT(concurrency-mt-unsafe) @@ -680,7 +682,7 @@ static void systemdNotify(const std::string_view & command) size_t addrlen = offsetof(struct sockaddr_un, sun_path) + len; - /// '@' meass this is Linux abstract socket, per documentation it must be sun_path[0] must be set to '\0' for it. + /// '@' means this is Linux abstract socket, per documentation sun_path[0] must be set to '\0' for it. if (path[0] == '@') addr.sun_path[0] = 0; else if (path[0] == '/') @@ -690,9 +692,20 @@ static void systemdNotify(const std::string_view & command) const struct sockaddr *sock_addr = reinterpret_cast (&addr); - if (sendto(s, command.data(), command.size(), 0, sock_addr, static_cast (addrlen)) != static_cast (command.size())) - throw Exception("Failed to notify systemd.", ErrorCodes::SYSTEM_ERROR); - + size_t sent_bytes_total = 0; + while (sent_bytes_total < command.size()) + { + auto sent_bytes = sendto(s, command.data() + sent_bytes_total, command.size() - sent_bytes_total, 0, sock_addr, static_cast(addrlen)); + if (sent_bytes == -1) + { + if (errno == EINTR) + continue; + else + throwFromErrno("Failed to notify systemd, sendto returned error.", ErrorCodes::SYSTEM_ERROR); + } + else + sent_bytes_total += sent_bytes; + } } #endif @@ -1831,6 +1844,7 @@ try } #if defined(OS_LINUX) + /// Tell the service manager that service startup is finished. systemdNotify("READY=1\n"); #endif From 688e6fe714b1ef890d21e0671faeba7f1e189219 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Mon, 5 Dec 2022 22:20:04 +0100 Subject: [PATCH 2/4] Send MAINPID= notification from the parent (watchdog) process to make systemd handle READY=1 notifiaction from the child --- packages/clickhouse-server.service | 4 +- programs/server/Server.cpp | 63 ++--------------------------- src/Daemon/BaseDaemon.cpp | 65 ++++++++++++++++++++++++++++++ src/Daemon/BaseDaemon.h | 6 +++ 4 files changed, 77 insertions(+), 61 deletions(-) diff --git a/packages/clickhouse-server.service b/packages/clickhouse-server.service index 1581b95213e..a1602482073 100644 --- a/packages/clickhouse-server.service +++ b/packages/clickhouse-server.service @@ -11,8 +11,8 @@ Wants=time-sync.target [Service] Type=notify -# Switching off watchdog is very important for sd_notify to work correctly. -Environment=CLICKHOUSE_WATCHDOG_ENABLE=0 +# NOTE: we leave clickhouse watchdog process enabled to be able to see OOM/SIGKILL traces in clickhouse-server.log files. +# If you wish to disable the watchdog and rely on systemd logs just add "Environment=CLICKHOUSE_WATCHDOG_ENABLE=0" line. User=clickhouse Group=clickhouse Restart=always diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 03870f59abc..1c82bfedae4 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1,6 +1,5 @@ #include "Server.h" -#include #include #include #include @@ -288,7 +287,6 @@ namespace ErrorCodes extern const int MISMATCHING_USERS_FOR_PROCESS_AND_DATA; extern const int NETWORK_ERROR; extern const int CORRUPTED_DATA; - extern const int SYSTEM_ERROR; } @@ -662,63 +660,6 @@ static void sanityChecks(Server & server) } } -#if defined(OS_LINUX) -/// Sends notification to systemd, analogous to sd_notify from libsystemd. -/// See https://man7.org/linux/man-pages/man3/sd_notify.3.html for more information on the supported notifications. -static void systemdNotify(const std::string_view & command) -{ - const char * path = getenv("NOTIFY_SOCKET"); // NOLINT(concurrency-mt-unsafe) - - if (path == nullptr) - return; /// not using systemd - - int s = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0); - - if (s == -1) - throwFromErrno("Can't create UNIX socket for systemd notify.", ErrorCodes::SYSTEM_ERROR); - - SCOPE_EXIT({ close(s); }); - - const size_t len = strlen(path); - - struct sockaddr_un addr; - - addr.sun_family = AF_UNIX; - - if (len < 2 || len > sizeof(addr.sun_path) - 1) - throw Exception(ErrorCodes::SYSTEM_ERROR, "NOTIFY_SOCKET env var value \"{}\" is wrong.", path); - - memcpy(addr.sun_path, path, len + 1); /// write last zero as well. - - size_t addrlen = offsetof(struct sockaddr_un, sun_path) + len; - - /// '@' means this is Linux abstract socket, per documentation sun_path[0] must be set to '\0' for it. - if (path[0] == '@') - addr.sun_path[0] = 0; - else if (path[0] == '/') - addrlen += 1; /// non-abstract-addresses should be zero terminated. - else - throw Exception(ErrorCodes::SYSTEM_ERROR, "Wrong UNIX path \"{}\" in NOTIFY_SOCKET env var", path); - - const struct sockaddr *sock_addr = reinterpret_cast (&addr); - - size_t sent_bytes_total = 0; - while (sent_bytes_total < command.size()) - { - auto sent_bytes = sendto(s, command.data() + sent_bytes_total, command.size() - sent_bytes_total, 0, sock_addr, static_cast(addrlen)); - if (sent_bytes == -1) - { - if (errno == EINTR) - continue; - else - throwFromErrno("Failed to notify systemd, sendto returned error.", ErrorCodes::SYSTEM_ERROR); - } - else - sent_bytes_total += sent_bytes; - } -} -#endif - int Server::main(const std::vector & /*args*/) try { @@ -1857,6 +1798,10 @@ try #if defined(OS_LINUX) /// Tell the service manager that service startup is finished. + /// NOTE: the parent clickhouse-watchdog process must do systemdNotify("MAINPID={}\n", child_pid); before + /// the child process notifies 'READY=1'. So there is a possibility of a race condition but the chances should be low + /// because the parent process does 'MAINPID=...' straight away after fork() and the child process does the actual + /// initialization and only then sends 'READY=1'. systemdNotify("READY=1\n"); #endif diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index 7283973007b..c99768ba57b 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -76,6 +76,7 @@ namespace DB { extern const int CANNOT_SET_SIGNAL_HANDLER; extern const int CANNOT_SEND_SIGNAL; + extern const int SYSTEM_ERROR; } } @@ -1013,6 +1014,15 @@ void BaseDaemon::setupWatchdog() return; } +#if defined(OS_LINUX) + /// Tell the service manager the actual main process is not this one but the forked process + /// because it is going to be serving the requests and it is going to send "READY=1" notification + /// when it is fully started. + /// NOTE: we do this right after fork() to minimize chances that the child process finishes initialization + /// and sends "READY=1" before we send "MAINPID=..." + systemdNotify(fmt::format("MAINPID={}\n", pid)); +#endif + /// Change short thread name and process name. setThreadName("clckhouse-watch"); /// 15 characters @@ -1131,3 +1141,58 @@ String BaseDaemon::getStoredBinaryHash() const { return stored_binary_hash; } + +#if defined(OS_LINUX) +void systemdNotify(const std::string_view & command) +{ + const char * path = getenv("NOTIFY_SOCKET"); // NOLINT(concurrency-mt-unsafe) + + if (path == nullptr) + return; /// not using systemd + + int s = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0); + + if (s == -1) + DB::throwFromErrno("Can't create UNIX socket for systemd notify.", DB::ErrorCodes::SYSTEM_ERROR); + + SCOPE_EXIT({ close(s); }); + + const size_t len = strlen(path); + + struct sockaddr_un addr; + + addr.sun_family = AF_UNIX; + + if (len < 2 || len > sizeof(addr.sun_path) - 1) + throw DB::Exception(DB::ErrorCodes::SYSTEM_ERROR, "NOTIFY_SOCKET env var value \"{}\" is wrong.", path); + + memcpy(addr.sun_path, path, len + 1); /// write last zero as well. + + size_t addrlen = offsetof(struct sockaddr_un, sun_path) + len; + + /// '@' means this is Linux abstract socket, per documentation sun_path[0] must be set to '\0' for it. + if (path[0] == '@') + addr.sun_path[0] = 0; + else if (path[0] == '/') + addrlen += 1; /// non-abstract-addresses should be zero terminated. + else + throw DB::Exception(DB::ErrorCodes::SYSTEM_ERROR, "Wrong UNIX path \"{}\" in NOTIFY_SOCKET env var", path); + + const struct sockaddr *sock_addr = reinterpret_cast (&addr); + + size_t sent_bytes_total = 0; + while (sent_bytes_total < command.size()) + { + auto sent_bytes = sendto(s, command.data() + sent_bytes_total, command.size() - sent_bytes_total, 0, sock_addr, static_cast(addrlen)); + if (sent_bytes == -1) + { + if (errno == EINTR) + continue; + else + DB::throwFromErrno("Failed to notify systemd, sendto returned error.", DB::ErrorCodes::SYSTEM_ERROR); + } + else + sent_bytes_total += sent_bytes; + } +} +#endif diff --git a/src/Daemon/BaseDaemon.h b/src/Daemon/BaseDaemon.h index ae64651caed..cb4aa0c2da6 100644 --- a/src/Daemon/BaseDaemon.h +++ b/src/Daemon/BaseDaemon.h @@ -197,3 +197,9 @@ std::optional> BaseDaemon::tryGetInstance() else return {}; } + +#if defined(OS_LINUX) +/// Sends notification (e.g. "server is ready") to systemd, analogous to sd_notify from libsystemd. +/// See https://www.freedesktop.org/software/systemd/man/sd_notify.html for more information on the supported notifications. +void systemdNotify(const std::string_view & command); +#endif From 3608763a30810ba93d9e2842efdc59069598aedd Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Tue, 6 Dec 2022 15:17:43 +0100 Subject: [PATCH 3/4] Wait for parent process to send MAINPID=... notification to the service manager --- programs/server/Server.cpp | 4 +--- src/Daemon/BaseDaemon.cpp | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 1c82bfedae4..83103a2cb4f 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1799,9 +1799,7 @@ try #if defined(OS_LINUX) /// Tell the service manager that service startup is finished. /// NOTE: the parent clickhouse-watchdog process must do systemdNotify("MAINPID={}\n", child_pid); before - /// the child process notifies 'READY=1'. So there is a possibility of a race condition but the chances should be low - /// because the parent process does 'MAINPID=...' straight away after fork() and the child process does the actual - /// initialization and only then sends 'READY=1'. + /// the child process notifies 'READY=1'. systemdNotify("READY=1\n"); #endif diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index c99768ba57b..520294ca909 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -998,6 +999,10 @@ void BaseDaemon::setupWatchdog() while (true) { + /// This pipe is used to synchronize notifications to the service manager from the child process + /// to be sent after the notifications from the parent process. + Poco::Pipe notify_sync; + static pid_t pid = -1; pid = fork(); @@ -1011,6 +1016,15 @@ void BaseDaemon::setupWatchdog() if (0 != prctl(PR_SET_PDEATHSIG, SIGKILL)) logger().warning("Cannot do prctl to ask termination with parent."); #endif + { + notify_sync.close(Poco::Pipe::CLOSE_WRITE); + /// Read from the pipe will block until the pipe is closed. + /// This way we synchronize with the parent process. + char buf[1]; + if (0 != notify_sync.readBytes(buf, sizeof(buf))) + throw Poco::Exception("Unexpected result while waiting for watchdog synchronization pipe to close."); + } + return; } @@ -1018,11 +1032,15 @@ void BaseDaemon::setupWatchdog() /// Tell the service manager the actual main process is not this one but the forked process /// because it is going to be serving the requests and it is going to send "READY=1" notification /// when it is fully started. - /// NOTE: we do this right after fork() to minimize chances that the child process finishes initialization - /// and sends "READY=1" before we send "MAINPID=..." + /// NOTE: we do this right after fork() and then notify the child process to "unblock" so that it finishes initialization + /// and sends "READY=1" after we have sent "MAINPID=..." systemdNotify(fmt::format("MAINPID={}\n", pid)); #endif + /// Close the pipe after notifying the service manager. + /// The child process is waiting for the pipe to be closed. + notify_sync.close(); + /// Change short thread name and process name. setThreadName("clckhouse-watch"); /// 15 characters From b62ca7b4e11f316ed2792db79482132a37c66377 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Tue, 6 Dec 2022 16:49:35 +0100 Subject: [PATCH 4/4] Some more fixes in startup logic --- src/Daemon/BaseDaemon.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index 520294ca909..1a36a81cbfb 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -1007,7 +1007,7 @@ void BaseDaemon::setupWatchdog() pid = fork(); if (-1 == pid) - throw Poco::Exception("Cannot fork"); + DB::throwFromErrno("Cannot fork", DB::ErrorCodes::SYSTEM_ERROR); if (0 == pid) { @@ -1015,7 +1015,11 @@ void BaseDaemon::setupWatchdog() #if defined(OS_LINUX) if (0 != prctl(PR_SET_PDEATHSIG, SIGKILL)) logger().warning("Cannot do prctl to ask termination with parent."); + + if (getppid() == 1) + throw Poco::Exception("Parent watchdog process has exited."); #endif + { notify_sync.close(Poco::Pipe::CLOSE_WRITE); /// Read from the pipe will block until the pipe is closed.