Reduce the general critical section for query_metric_log

- Use a separate mutex for each query to reduce the contention
period for queries_mutex.

- Refactor to use std::mutex instead of std::recursive_mutex
for queries_mutex.

- In case we're running late to schedule the next task, schedule
it immediately.

- Fix LockGuard because unlocking twice is undefined behavior.
This commit is contained in:
Pablo Marcos 2024-11-05 14:02:43 +00:00
parent bac948ec0e
commit 9cdd56abbc
4 changed files with 179 additions and 62 deletions

View File

@ -145,6 +145,7 @@
#define TSA_TRY_ACQUIRE_SHARED(...) __attribute__((try_acquire_shared_capability(__VA_ARGS__))) /// function tries to acquire a shared capability and returns a boolean value indicating success or failure
#define TSA_RELEASE_SHARED(...) __attribute__((release_shared_capability(__VA_ARGS__))) /// function releases the given shared capability
#define TSA_SCOPED_LOCKABLE __attribute__((scoped_lockable)) /// object of a class has scoped lockable capability
#define TSA_RETURN_CAPABILITY(...) __attribute__((lock_returned(__VA_ARGS__))) /// to return capabilities in functions
/// Macros for suppressing TSA warnings for specific reads/writes (instead of suppressing it for the whole function)
/// They use a lambda function to apply function attribute to a single statement. This enable us to suppress warnings locally instead of

View File

@ -1,23 +1,47 @@
#pragma once
#include <Common/OvercommitTracker.h>
#include <base/defines.h>
#include <Common/Exception.h>
#include <Common/OvercommitTracker.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
};
/** LockGuard provides RAII-style locking mechanism for a mutex.
** It's intended to be used like std::unique_ptr but with TSA annotations
** It's intended to be used like std::unique_lock but with TSA annotations
*/
template <typename Mutex>
class TSA_SCOPED_LOCKABLE LockGuard
{
public:
explicit LockGuard(Mutex & mutex_) TSA_ACQUIRE(mutex_) : mutex(mutex_) { mutex.lock(); }
~LockGuard() TSA_RELEASE() { mutex.unlock(); }
explicit LockGuard(Mutex & mutex_) TSA_ACQUIRE(mutex_) : mutex(mutex_) { lock(); }
~LockGuard() TSA_RELEASE() { if (locked) unlock(); }
void lock() TSA_ACQUIRE()
{
/// Don't allow recursive_mutex for now.
if (locked)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't lock twice the same mutex");
mutex.lock();
locked = true;
}
void unlock() TSA_RELEASE()
{
if (!locked)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't unlock the mutex without locking it first");
mutex.unlock();
locked = false;
}
private:
Mutex & mutex;
bool locked = false;
};
template <template<typename> typename TLockGuard, typename Mutex>

View File

@ -1,6 +1,7 @@
#include <base/getFQDNOrHostName.h>
#include <Common/DateLUT.h>
#include <Common/DateLUTImpl.h>
#include <Common/LockGuard.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeDateTime64.h>
@ -16,7 +17,6 @@
#include <chrono>
#include <fmt/chrono.h>
#include <mutex>
namespace DB
@ -24,6 +24,20 @@ namespace DB
static auto logger = getLogger("QueryMetricLog");
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
};
String timePointToString(QueryMetricLog::TimePoint time)
{
/// fmtlib supports subsecond formatting in 10.0.0. We're in 9.1.0, so we need to add the milliseconds ourselves.
auto seconds = std::chrono::time_point_cast<std::chrono::seconds>(time);
auto microseconds = std::chrono::duration_cast<std::chrono::microseconds>(time - seconds).count();
return fmt::format("{:%Y.%m.%d %H:%M:%S}.{:06}", seconds, microseconds);
}
ColumnsDescription QueryMetricLogElement::getColumnsDescription()
{
ColumnsDescription result;
@ -87,36 +101,69 @@ void QueryMetricLog::shutdown()
Base::shutdown();
}
void QueryMetricLog::startQuery(const String & query_id, TimePoint start_time, UInt64 interval_milliseconds)
void QueryMetricLog::collectMetric(const ProcessList & process_list, String query_id)
{
QueryMetricLogStatus status;
status.interval_milliseconds = interval_milliseconds;
status.next_collect_time = start_time + std::chrono::milliseconds(interval_milliseconds);
auto current_time = std::chrono::system_clock::now();
const auto query_info = process_list.getQueryInfo(query_id, false, true, false);
if (!query_info)
{
LOG_TRACE(logger, "Query {} is not running anymore, so we couldn't get its QueryStatusInfo", query_id);
return;
}
LockGuard global_lock(queries_mutex);
auto it = queries.find(query_id);
/// The query might have finished while the scheduled task is running.
if (it == queries.end())
{
global_lock.unlock();
LOG_TRACE(logger, "Query {} not found in the list. Finished while this collecting task was running", query_id);
return;
}
auto & query_status = it->second;
if (!query_status.mutex)
{
global_lock.unlock();
LOG_TRACE(logger, "Query {} finished while this collecting task was running", query_id);
return;
}
LockGuard query_lock(query_status.getMutex());
global_lock.unlock();
auto elem = query_status.createLogMetricElement(query_id, *query_info, current_time);
if (elem)
add(std::move(elem.value()));
}
/// We use TSA_NO_THREAD_SAFETY_ANALYSIS to prevent TSA complaining that we're modifying the query_status fields
/// without locking the mutex. Since we're building it from scratch, there's no harm in not holding it.
/// If we locked it to make TSA happy, TSAN build would falsely complain about
/// lock-order-inversion (potential deadlock)
/// which is not a real issue since QueryMetricLogStatus's mutex cannot be locked by anything else
/// until we add it to the queries map.
void QueryMetricLog::startQuery(const String & query_id, TimePoint start_time, UInt64 interval_milliseconds) TSA_NO_THREAD_SAFETY_ANALYSIS
{
QueryMetricLogStatus query_status;
query_status.interval_milliseconds = interval_milliseconds;
query_status.next_collect_time = start_time + std::chrono::milliseconds(interval_milliseconds);
auto context = getContext();
const auto & process_list = context->getProcessList();
status.task = context->getSchedulePool().createTask("QueryMetricLog", [this, &process_list, query_id] {
auto current_time = std::chrono::system_clock::now();
const auto query_info = process_list.getQueryInfo(query_id, false, true, false);
if (!query_info)
{
LOG_TRACE(logger, "Query {} is not running anymore, so we couldn't get its QueryStatusInfo", query_id);
return;
}
auto elem = createLogMetricElement(query_id, *query_info, current_time);
if (elem)
add(std::move(elem.value()));
query_status.task = context->getSchedulePool().createTask("QueryMetricLog", [this, &process_list, query_id] {
collectMetric(process_list, query_id);
});
std::lock_guard lock(queries_mutex);
status.task->scheduleAfter(interval_milliseconds);
queries.emplace(query_id, std::move(status));
LockGuard global_lock(queries_mutex);
query_status.scheduleNext(query_id);
queries.emplace(query_id, std::move(query_status));
}
void QueryMetricLog::finishQuery(const String & query_id, TimePoint finish_time, QueryStatusInfoPtr query_info)
{
std::unique_lock lock(queries_mutex);
LockGuard global_lock(queries_mutex);
auto it = queries.find(query_id);
/// finishQuery may be called from logExceptionBeforeStart when the query has not even started
@ -124,9 +171,19 @@ void QueryMetricLog::finishQuery(const String & query_id, TimePoint finish_time,
if (it == queries.end())
return;
auto & query_status = it->second;
decltype(query_status.mutex) query_mutex;
LockGuard query_lock(query_status.getMutex());
/// Move the query mutex here so that we hold it until the end, after removing the query from queries.
query_mutex = std::move(query_status.mutex);
query_status.mutex = {};
global_lock.unlock();
if (query_info)
{
auto elem = createLogMetricElement(query_id, *query_info, finish_time, false);
auto elem = query_status.createLogMetricElement(query_id, *query_info, finish_time, false);
if (elem)
add(std::move(elem.value()));
}
@ -139,51 +196,62 @@ void QueryMetricLog::finishQuery(const String & query_id, TimePoint finish_time,
/// that order.
{
/// Take ownership of the task so that we can destroy it in this scope after unlocking `queries_mutex`.
auto task = std::move(it->second.task);
auto task = std::move(query_status.task);
/// Build an empty task for the old task to make sure it does not lock any mutex on its destruction.
it->second.task = {};
query_status.task = {};
query_lock.unlock();
global_lock.lock();
queries.erase(query_id);
/// Ensure `queries_mutex` is unlocked before calling task's destructor at the end of this
/// scope which will lock `exec_mutex`.
lock.unlock();
global_lock.unlock();
}
}
std::optional<QueryMetricLogElement> QueryMetricLog::createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next)
void QueryMetricLogStatus::scheduleNext(String query_id)
{
/// fmtlib supports subsecond formatting in 10.0.0. We're in 9.1.0, so we need to add the milliseconds ourselves.
auto seconds = std::chrono::time_point_cast<std::chrono::seconds>(query_info_time);
auto microseconds = std::chrono::duration_cast<std::chrono::microseconds>(query_info_time - seconds).count();
LOG_DEBUG(logger, "Collecting query_metric_log for query {} with QueryStatusInfo from {:%Y.%m.%d %H:%M:%S}.{:06}. Schedule next: {}", query_id, seconds, microseconds, schedule_next);
std::unique_lock lock(queries_mutex);
auto query_status_it = queries.find(query_id);
/// The query might have finished while the scheduled task is running.
if (query_status_it == queries.end())
const auto now = std::chrono::system_clock::now();
if (next_collect_time > now)
{
lock.unlock();
LOG_TRACE(logger, "Query {} finished already while this collecting task was running", query_id);
return {};
const auto wait_time = std::chrono::duration_cast<std::chrono::milliseconds>(next_collect_time - now).count();
task->scheduleAfter(wait_time);
}
auto & query_status = query_status_it->second;
if (query_info_time <= query_status.last_collect_time)
else
{
LOG_TRACE(logger, "The next collecting task for query {} should have already run at {}. Scheduling it right now",
query_id, timePointToString(next_collect_time));
task->schedule();
}
}
std::optional<QueryMetricLogElement> QueryMetricLogStatus::createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next)
{
LOG_TRACE(logger, "Collecting query_metric_log for query {} and interval {} ms with QueryStatusInfo from {}. Schedule next: {}",
query_id, interval_milliseconds, timePointToString(query_info_time), schedule_next);
if (query_info_time <= last_collect_time)
{
lock.unlock();
LOG_TRACE(logger, "Query {} has a more recent metrics collected. Skipping this one", query_id);
return {};
}
query_status.last_collect_time = query_info_time;
/// Leave some margin because task->scheduleAfter takes a value in milliseconds.
/// So, we can expect up to 1ms of drift since BackgroundSchedulePool will compare
/// time points in milliseconds.
static auto error_margin = std::chrono::milliseconds(1);
if (schedule_next && query_info_time + error_margin < next_collect_time)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Task to collect metric for query {} scheduled at {} but run at {}",
query_id, timePointToString(next_collect_time), timePointToString(query_info_time));
last_collect_time = query_info_time;
QueryMetricLogElement elem;
elem.event_time = timeInSeconds(query_info_time);
elem.event_time_microseconds = timeInMicroseconds(query_info_time);
elem.query_id = query_status_it->first;
elem.query_id = query_id;
elem.memory_usage = query_info.memory_usage > 0 ? query_info.memory_usage : 0;
elem.peak_memory_usage = query_info.peak_memory_usage > 0 ? query_info.peak_memory_usage : 0;
@ -192,7 +260,7 @@ std::optional<QueryMetricLogElement> QueryMetricLog::createLogMetricElement(cons
for (ProfileEvents::Event i = ProfileEvents::Event(0), end = ProfileEvents::end(); i < end; ++i)
{
const auto & new_value = (*(query_info.profile_counters))[i];
auto & old_value = query_status.last_profile_events[i];
auto & old_value = last_profile_events[i];
/// Profile event counters are supposed to be monotonic. However, at least the `NetworkReceiveBytes` can be inaccurate.
/// So, since in the future the counter should always have a bigger value than in the past, we skip this event.
@ -214,9 +282,8 @@ std::optional<QueryMetricLogElement> QueryMetricLog::createLogMetricElement(cons
if (schedule_next)
{
query_status.next_collect_time += std::chrono::milliseconds(query_status.interval_milliseconds);
const auto wait_time = std::chrono::duration_cast<std::chrono::milliseconds>(query_status.next_collect_time - std::chrono::system_clock::now()).count();
query_status.task->scheduleAfter(wait_time);
next_collect_time += std::chrono::milliseconds(interval_milliseconds);
scheduleNext(query_id);
}
return elem;

View File

@ -1,5 +1,6 @@
#pragma once
#include <base/defines.h>
#include <Common/ProfileEvents.h>
#include <Common/CurrentMetrics.h>
#include <Core/BackgroundSchedulePool.h>
@ -11,11 +12,17 @@
#include <chrono>
#include <ctime>
#include <mutex>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
};
/** QueryMetricLogElement is a log of query metric values measured at regular time interval.
*/
@ -36,31 +43,49 @@ struct QueryMetricLogElement
struct QueryMetricLogStatus
{
using TimePoint = std::chrono::system_clock::time_point;
using Mutex = std::mutex;
UInt64 interval_milliseconds;
std::chrono::system_clock::time_point last_collect_time;
std::chrono::system_clock::time_point next_collect_time;
std::vector<ProfileEvents::Count> last_profile_events = std::vector<ProfileEvents::Count>(ProfileEvents::end());
BackgroundSchedulePool::TaskHolder task;
std::chrono::system_clock::time_point last_collect_time TSA_GUARDED_BY(getMutex());
std::chrono::system_clock::time_point next_collect_time TSA_GUARDED_BY(getMutex());
std::vector<ProfileEvents::Count> last_profile_events TSA_GUARDED_BY(getMutex()) = std::vector<ProfileEvents::Count>(ProfileEvents::end());
BackgroundSchedulePool::TaskHolder task TSA_GUARDED_BY(getMutex());
/// We need to be able to move it for the hash map, so we need to add an indirection here.
std::unique_ptr<Mutex> mutex = std::make_unique<Mutex>();
/// Return a reference to the mutex, used for Thread Sanitizer annotations.
Mutex & getMutex() const TSA_RETURN_CAPABILITY(mutex)
{
if (!mutex)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mutex cannot be NULL");
return *mutex;
}
void scheduleNext(String query_id) TSA_REQUIRES(getMutex());
std::optional<QueryMetricLogElement> createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next = true) TSA_REQUIRES(getMutex());
};
class QueryMetricLog : public SystemLog<QueryMetricLogElement>
{
using SystemLog<QueryMetricLogElement>::SystemLog;
using TimePoint = std::chrono::system_clock::time_point;
using Base = SystemLog<QueryMetricLogElement>;
public:
using TimePoint = std::chrono::system_clock::time_point;
void shutdown() final;
// Both startQuery and finishQuery are called from the thread that executes the query
/// Both startQuery and finishQuery are called from the thread that executes the query.
void startQuery(const String & query_id, TimePoint start_time, UInt64 interval_milliseconds);
void finishQuery(const String & query_id, TimePoint finish_time, QueryStatusInfoPtr query_info = nullptr);
private:
std::optional<QueryMetricLogElement> createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next = true);
void collectMetric(const ProcessList & process_list, String query_id);
std::recursive_mutex queries_mutex;
std::unordered_map<String, QueryMetricLogStatus> queries;
std::mutex queries_mutex;
std::unordered_map<String, QueryMetricLogStatus> queries TSA_GUARDED_BY(queries_mutex);
};
}