From 9cdd56abbc4c75936fb4456d741b012fd710f98a Mon Sep 17 00:00:00 2001 From: Pablo Marcos Date: Tue, 5 Nov 2024 14:02:43 +0000 Subject: [PATCH] Reduce the general critical section for query_metric_log - Use a separate mutex for each query to reduce the contention period for queries_mutex. - Refactor to use std::mutex instead of std::recursive_mutex for queries_mutex. - In case we're running late to schedule the next task, schedule it immediately. - Fix LockGuard because unlocking twice is undefined behavior. --- base/base/defines.h | 1 + src/Common/LockGuard.h | 32 +++++- src/Interpreters/QueryMetricLog.cpp | 165 +++++++++++++++++++--------- src/Interpreters/QueryMetricLog.h | 43 ++++++-- 4 files changed, 179 insertions(+), 62 deletions(-) diff --git a/base/base/defines.h b/base/base/defines.h index 5685a6d9833..a0c3c0d1de5 100644 --- a/base/base/defines.h +++ b/base/base/defines.h @@ -145,6 +145,7 @@ #define TSA_TRY_ACQUIRE_SHARED(...) __attribute__((try_acquire_shared_capability(__VA_ARGS__))) /// function tries to acquire a shared capability and returns a boolean value indicating success or failure #define TSA_RELEASE_SHARED(...) __attribute__((release_shared_capability(__VA_ARGS__))) /// function releases the given shared capability #define TSA_SCOPED_LOCKABLE __attribute__((scoped_lockable)) /// object of a class has scoped lockable capability +#define TSA_RETURN_CAPABILITY(...) __attribute__((lock_returned(__VA_ARGS__))) /// to return capabilities in functions /// Macros for suppressing TSA warnings for specific reads/writes (instead of suppressing it for the whole function) /// They use a lambda function to apply function attribute to a single statement. This enable us to suppress warnings locally instead of diff --git a/src/Common/LockGuard.h b/src/Common/LockGuard.h index 8a98c5f553a..03c8a3e7617 100644 --- a/src/Common/LockGuard.h +++ b/src/Common/LockGuard.h @@ -1,23 +1,47 @@ #pragma once -#include #include +#include +#include namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +}; + /** LockGuard provides RAII-style locking mechanism for a mutex. - ** It's intended to be used like std::unique_ptr but with TSA annotations + ** It's intended to be used like std::unique_lock but with TSA annotations */ template class TSA_SCOPED_LOCKABLE LockGuard { public: - explicit LockGuard(Mutex & mutex_) TSA_ACQUIRE(mutex_) : mutex(mutex_) { mutex.lock(); } - ~LockGuard() TSA_RELEASE() { mutex.unlock(); } + explicit LockGuard(Mutex & mutex_) TSA_ACQUIRE(mutex_) : mutex(mutex_) { lock(); } + ~LockGuard() TSA_RELEASE() { if (locked) unlock(); } + + void lock() TSA_ACQUIRE() + { + /// Don't allow recursive_mutex for now. + if (locked) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't lock twice the same mutex"); + mutex.lock(); + locked = true; + } + + void unlock() TSA_RELEASE() + { + if (!locked) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't unlock the mutex without locking it first"); + mutex.unlock(); + locked = false; + } private: Mutex & mutex; + bool locked = false; }; template typename TLockGuard, typename Mutex> diff --git a/src/Interpreters/QueryMetricLog.cpp b/src/Interpreters/QueryMetricLog.cpp index 5ab3fe590e0..e784c357b29 100644 --- a/src/Interpreters/QueryMetricLog.cpp +++ b/src/Interpreters/QueryMetricLog.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -16,7 +17,6 @@ #include #include -#include namespace DB @@ -24,6 +24,20 @@ namespace DB static auto logger = getLogger("QueryMetricLog"); +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +}; + +String timePointToString(QueryMetricLog::TimePoint time) +{ + /// fmtlib supports subsecond formatting in 10.0.0. We're in 9.1.0, so we need to add the milliseconds ourselves. + auto seconds = std::chrono::time_point_cast(time); + auto microseconds = std::chrono::duration_cast(time - seconds).count(); + + return fmt::format("{:%Y.%m.%d %H:%M:%S}.{:06}", seconds, microseconds); +} + ColumnsDescription QueryMetricLogElement::getColumnsDescription() { ColumnsDescription result; @@ -87,36 +101,69 @@ void QueryMetricLog::shutdown() Base::shutdown(); } -void QueryMetricLog::startQuery(const String & query_id, TimePoint start_time, UInt64 interval_milliseconds) +void QueryMetricLog::collectMetric(const ProcessList & process_list, String query_id) { - QueryMetricLogStatus status; - status.interval_milliseconds = interval_milliseconds; - status.next_collect_time = start_time + std::chrono::milliseconds(interval_milliseconds); + auto current_time = std::chrono::system_clock::now(); + const auto query_info = process_list.getQueryInfo(query_id, false, true, false); + if (!query_info) + { + LOG_TRACE(logger, "Query {} is not running anymore, so we couldn't get its QueryStatusInfo", query_id); + return; + } + + LockGuard global_lock(queries_mutex); + auto it = queries.find(query_id); + + /// The query might have finished while the scheduled task is running. + if (it == queries.end()) + { + global_lock.unlock(); + LOG_TRACE(logger, "Query {} not found in the list. Finished while this collecting task was running", query_id); + return; + } + + auto & query_status = it->second; + if (!query_status.mutex) + { + global_lock.unlock(); + LOG_TRACE(logger, "Query {} finished while this collecting task was running", query_id); + return; + } + + LockGuard query_lock(query_status.getMutex()); + global_lock.unlock(); + + auto elem = query_status.createLogMetricElement(query_id, *query_info, current_time); + if (elem) + add(std::move(elem.value())); +} + +/// We use TSA_NO_THREAD_SAFETY_ANALYSIS to prevent TSA complaining that we're modifying the query_status fields +/// without locking the mutex. Since we're building it from scratch, there's no harm in not holding it. +/// If we locked it to make TSA happy, TSAN build would falsely complain about +/// lock-order-inversion (potential deadlock) +/// which is not a real issue since QueryMetricLogStatus's mutex cannot be locked by anything else +/// until we add it to the queries map. +void QueryMetricLog::startQuery(const String & query_id, TimePoint start_time, UInt64 interval_milliseconds) TSA_NO_THREAD_SAFETY_ANALYSIS +{ + QueryMetricLogStatus query_status; + query_status.interval_milliseconds = interval_milliseconds; + query_status.next_collect_time = start_time + std::chrono::milliseconds(interval_milliseconds); auto context = getContext(); const auto & process_list = context->getProcessList(); - status.task = context->getSchedulePool().createTask("QueryMetricLog", [this, &process_list, query_id] { - auto current_time = std::chrono::system_clock::now(); - const auto query_info = process_list.getQueryInfo(query_id, false, true, false); - if (!query_info) - { - LOG_TRACE(logger, "Query {} is not running anymore, so we couldn't get its QueryStatusInfo", query_id); - return; - } - - auto elem = createLogMetricElement(query_id, *query_info, current_time); - if (elem) - add(std::move(elem.value())); + query_status.task = context->getSchedulePool().createTask("QueryMetricLog", [this, &process_list, query_id] { + collectMetric(process_list, query_id); }); - std::lock_guard lock(queries_mutex); - status.task->scheduleAfter(interval_milliseconds); - queries.emplace(query_id, std::move(status)); + LockGuard global_lock(queries_mutex); + query_status.scheduleNext(query_id); + queries.emplace(query_id, std::move(query_status)); } void QueryMetricLog::finishQuery(const String & query_id, TimePoint finish_time, QueryStatusInfoPtr query_info) { - std::unique_lock lock(queries_mutex); + LockGuard global_lock(queries_mutex); auto it = queries.find(query_id); /// finishQuery may be called from logExceptionBeforeStart when the query has not even started @@ -124,9 +171,19 @@ void QueryMetricLog::finishQuery(const String & query_id, TimePoint finish_time, if (it == queries.end()) return; + auto & query_status = it->second; + decltype(query_status.mutex) query_mutex; + LockGuard query_lock(query_status.getMutex()); + + /// Move the query mutex here so that we hold it until the end, after removing the query from queries. + query_mutex = std::move(query_status.mutex); + query_status.mutex = {}; + + global_lock.unlock(); + if (query_info) { - auto elem = createLogMetricElement(query_id, *query_info, finish_time, false); + auto elem = query_status.createLogMetricElement(query_id, *query_info, finish_time, false); if (elem) add(std::move(elem.value())); } @@ -139,51 +196,62 @@ void QueryMetricLog::finishQuery(const String & query_id, TimePoint finish_time, /// that order. { /// Take ownership of the task so that we can destroy it in this scope after unlocking `queries_mutex`. - auto task = std::move(it->second.task); + auto task = std::move(query_status.task); /// Build an empty task for the old task to make sure it does not lock any mutex on its destruction. - it->second.task = {}; + query_status.task = {}; + query_lock.unlock(); + global_lock.lock(); queries.erase(query_id); /// Ensure `queries_mutex` is unlocked before calling task's destructor at the end of this /// scope which will lock `exec_mutex`. - lock.unlock(); + global_lock.unlock(); } } -std::optional QueryMetricLog::createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next) +void QueryMetricLogStatus::scheduleNext(String query_id) { - /// fmtlib supports subsecond formatting in 10.0.0. We're in 9.1.0, so we need to add the milliseconds ourselves. - auto seconds = std::chrono::time_point_cast(query_info_time); - auto microseconds = std::chrono::duration_cast(query_info_time - seconds).count(); - LOG_DEBUG(logger, "Collecting query_metric_log for query {} with QueryStatusInfo from {:%Y.%m.%d %H:%M:%S}.{:06}. Schedule next: {}", query_id, seconds, microseconds, schedule_next); - - std::unique_lock lock(queries_mutex); - auto query_status_it = queries.find(query_id); - - /// The query might have finished while the scheduled task is running. - if (query_status_it == queries.end()) + const auto now = std::chrono::system_clock::now(); + if (next_collect_time > now) { - lock.unlock(); - LOG_TRACE(logger, "Query {} finished already while this collecting task was running", query_id); - return {}; + const auto wait_time = std::chrono::duration_cast(next_collect_time - now).count(); + task->scheduleAfter(wait_time); } - - auto & query_status = query_status_it->second; - if (query_info_time <= query_status.last_collect_time) + else + { + LOG_TRACE(logger, "The next collecting task for query {} should have already run at {}. Scheduling it right now", + query_id, timePointToString(next_collect_time)); + task->schedule(); + } +} + +std::optional QueryMetricLogStatus::createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next) +{ + LOG_TRACE(logger, "Collecting query_metric_log for query {} and interval {} ms with QueryStatusInfo from {}. Schedule next: {}", + query_id, interval_milliseconds, timePointToString(query_info_time), schedule_next); + + if (query_info_time <= last_collect_time) { - lock.unlock(); LOG_TRACE(logger, "Query {} has a more recent metrics collected. Skipping this one", query_id); return {}; } - query_status.last_collect_time = query_info_time; + /// Leave some margin because task->scheduleAfter takes a value in milliseconds. + /// So, we can expect up to 1ms of drift since BackgroundSchedulePool will compare + /// time points in milliseconds. + static auto error_margin = std::chrono::milliseconds(1); + if (schedule_next && query_info_time + error_margin < next_collect_time) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Task to collect metric for query {} scheduled at {} but run at {}", + query_id, timePointToString(next_collect_time), timePointToString(query_info_time)); + + last_collect_time = query_info_time; QueryMetricLogElement elem; elem.event_time = timeInSeconds(query_info_time); elem.event_time_microseconds = timeInMicroseconds(query_info_time); - elem.query_id = query_status_it->first; + elem.query_id = query_id; elem.memory_usage = query_info.memory_usage > 0 ? query_info.memory_usage : 0; elem.peak_memory_usage = query_info.peak_memory_usage > 0 ? query_info.peak_memory_usage : 0; @@ -192,7 +260,7 @@ std::optional QueryMetricLog::createLogMetricElement(cons for (ProfileEvents::Event i = ProfileEvents::Event(0), end = ProfileEvents::end(); i < end; ++i) { const auto & new_value = (*(query_info.profile_counters))[i]; - auto & old_value = query_status.last_profile_events[i]; + auto & old_value = last_profile_events[i]; /// Profile event counters are supposed to be monotonic. However, at least the `NetworkReceiveBytes` can be inaccurate. /// So, since in the future the counter should always have a bigger value than in the past, we skip this event. @@ -214,9 +282,8 @@ std::optional QueryMetricLog::createLogMetricElement(cons if (schedule_next) { - query_status.next_collect_time += std::chrono::milliseconds(query_status.interval_milliseconds); - const auto wait_time = std::chrono::duration_cast(query_status.next_collect_time - std::chrono::system_clock::now()).count(); - query_status.task->scheduleAfter(wait_time); + next_collect_time += std::chrono::milliseconds(interval_milliseconds); + scheduleNext(query_id); } return elem; diff --git a/src/Interpreters/QueryMetricLog.h b/src/Interpreters/QueryMetricLog.h index 802cee7bf26..65764229b0a 100644 --- a/src/Interpreters/QueryMetricLog.h +++ b/src/Interpreters/QueryMetricLog.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -11,11 +12,17 @@ #include #include +#include namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +}; + /** QueryMetricLogElement is a log of query metric values measured at regular time interval. */ @@ -36,31 +43,49 @@ struct QueryMetricLogElement struct QueryMetricLogStatus { + using TimePoint = std::chrono::system_clock::time_point; + using Mutex = std::mutex; + UInt64 interval_milliseconds; - std::chrono::system_clock::time_point last_collect_time; - std::chrono::system_clock::time_point next_collect_time; - std::vector last_profile_events = std::vector(ProfileEvents::end()); - BackgroundSchedulePool::TaskHolder task; + std::chrono::system_clock::time_point last_collect_time TSA_GUARDED_BY(getMutex()); + std::chrono::system_clock::time_point next_collect_time TSA_GUARDED_BY(getMutex()); + std::vector last_profile_events TSA_GUARDED_BY(getMutex()) = std::vector(ProfileEvents::end()); + BackgroundSchedulePool::TaskHolder task TSA_GUARDED_BY(getMutex()); + + /// We need to be able to move it for the hash map, so we need to add an indirection here. + std::unique_ptr mutex = std::make_unique(); + + /// Return a reference to the mutex, used for Thread Sanitizer annotations. + Mutex & getMutex() const TSA_RETURN_CAPABILITY(mutex) + { + if (!mutex) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Mutex cannot be NULL"); + return *mutex; + } + + void scheduleNext(String query_id) TSA_REQUIRES(getMutex()); + std::optional createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next = true) TSA_REQUIRES(getMutex()); }; class QueryMetricLog : public SystemLog { using SystemLog::SystemLog; - using TimePoint = std::chrono::system_clock::time_point; using Base = SystemLog; public: + using TimePoint = std::chrono::system_clock::time_point; + void shutdown() final; - // Both startQuery and finishQuery are called from the thread that executes the query + /// Both startQuery and finishQuery are called from the thread that executes the query. void startQuery(const String & query_id, TimePoint start_time, UInt64 interval_milliseconds); void finishQuery(const String & query_id, TimePoint finish_time, QueryStatusInfoPtr query_info = nullptr); private: - std::optional createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next = true); + void collectMetric(const ProcessList & process_list, String query_id); - std::recursive_mutex queries_mutex; - std::unordered_map queries; + std::mutex queries_mutex; + std::unordered_map queries TSA_GUARDED_BY(queries_mutex); }; }