mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-23 16:12:01 +00:00
Reduce the general critical section for query_metric_log
- Use a separate mutex for each query to reduce the contention period for queries_mutex. - Refactor to use std::mutex instead of std::recursive_mutex for queries_mutex. - In case we're running late to schedule the next task, schedule it immediately. - Fix LockGuard because unlocking twice is undefined behavior.
This commit is contained in:
parent
bac948ec0e
commit
9cdd56abbc
@ -145,6 +145,7 @@
|
||||
#define TSA_TRY_ACQUIRE_SHARED(...) __attribute__((try_acquire_shared_capability(__VA_ARGS__))) /// function tries to acquire a shared capability and returns a boolean value indicating success or failure
|
||||
#define TSA_RELEASE_SHARED(...) __attribute__((release_shared_capability(__VA_ARGS__))) /// function releases the given shared capability
|
||||
#define TSA_SCOPED_LOCKABLE __attribute__((scoped_lockable)) /// object of a class has scoped lockable capability
|
||||
#define TSA_RETURN_CAPABILITY(...) __attribute__((lock_returned(__VA_ARGS__))) /// to return capabilities in functions
|
||||
|
||||
/// Macros for suppressing TSA warnings for specific reads/writes (instead of suppressing it for the whole function)
|
||||
/// They use a lambda function to apply function attribute to a single statement. This enable us to suppress warnings locally instead of
|
||||
|
@ -1,23 +1,47 @@
|
||||
#pragma once
|
||||
|
||||
#include <Common/OvercommitTracker.h>
|
||||
#include <base/defines.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <Common/OvercommitTracker.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
};
|
||||
|
||||
/** LockGuard provides RAII-style locking mechanism for a mutex.
|
||||
** It's intended to be used like std::unique_ptr but with TSA annotations
|
||||
** It's intended to be used like std::unique_lock but with TSA annotations
|
||||
*/
|
||||
template <typename Mutex>
|
||||
class TSA_SCOPED_LOCKABLE LockGuard
|
||||
{
|
||||
public:
|
||||
explicit LockGuard(Mutex & mutex_) TSA_ACQUIRE(mutex_) : mutex(mutex_) { mutex.lock(); }
|
||||
~LockGuard() TSA_RELEASE() { mutex.unlock(); }
|
||||
explicit LockGuard(Mutex & mutex_) TSA_ACQUIRE(mutex_) : mutex(mutex_) { lock(); }
|
||||
~LockGuard() TSA_RELEASE() { if (locked) unlock(); }
|
||||
|
||||
void lock() TSA_ACQUIRE()
|
||||
{
|
||||
/// Don't allow recursive_mutex for now.
|
||||
if (locked)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't lock twice the same mutex");
|
||||
mutex.lock();
|
||||
locked = true;
|
||||
}
|
||||
|
||||
void unlock() TSA_RELEASE()
|
||||
{
|
||||
if (!locked)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't unlock the mutex without locking it first");
|
||||
mutex.unlock();
|
||||
locked = false;
|
||||
}
|
||||
|
||||
private:
|
||||
Mutex & mutex;
|
||||
bool locked = false;
|
||||
};
|
||||
|
||||
template <template<typename> typename TLockGuard, typename Mutex>
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <base/getFQDNOrHostName.h>
|
||||
#include <Common/DateLUT.h>
|
||||
#include <Common/DateLUTImpl.h>
|
||||
#include <Common/LockGuard.h>
|
||||
#include <DataTypes/DataTypeDate.h>
|
||||
#include <DataTypes/DataTypeDateTime.h>
|
||||
#include <DataTypes/DataTypeDateTime64.h>
|
||||
@ -16,7 +17,6 @@
|
||||
|
||||
#include <chrono>
|
||||
#include <fmt/chrono.h>
|
||||
#include <mutex>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -24,6 +24,20 @@ namespace DB
|
||||
|
||||
static auto logger = getLogger("QueryMetricLog");
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
};
|
||||
|
||||
String timePointToString(QueryMetricLog::TimePoint time)
|
||||
{
|
||||
/// fmtlib supports subsecond formatting in 10.0.0. We're in 9.1.0, so we need to add the milliseconds ourselves.
|
||||
auto seconds = std::chrono::time_point_cast<std::chrono::seconds>(time);
|
||||
auto microseconds = std::chrono::duration_cast<std::chrono::microseconds>(time - seconds).count();
|
||||
|
||||
return fmt::format("{:%Y.%m.%d %H:%M:%S}.{:06}", seconds, microseconds);
|
||||
}
|
||||
|
||||
ColumnsDescription QueryMetricLogElement::getColumnsDescription()
|
||||
{
|
||||
ColumnsDescription result;
|
||||
@ -87,36 +101,69 @@ void QueryMetricLog::shutdown()
|
||||
Base::shutdown();
|
||||
}
|
||||
|
||||
void QueryMetricLog::startQuery(const String & query_id, TimePoint start_time, UInt64 interval_milliseconds)
|
||||
void QueryMetricLog::collectMetric(const ProcessList & process_list, String query_id)
|
||||
{
|
||||
QueryMetricLogStatus status;
|
||||
status.interval_milliseconds = interval_milliseconds;
|
||||
status.next_collect_time = start_time + std::chrono::milliseconds(interval_milliseconds);
|
||||
auto current_time = std::chrono::system_clock::now();
|
||||
const auto query_info = process_list.getQueryInfo(query_id, false, true, false);
|
||||
if (!query_info)
|
||||
{
|
||||
LOG_TRACE(logger, "Query {} is not running anymore, so we couldn't get its QueryStatusInfo", query_id);
|
||||
return;
|
||||
}
|
||||
|
||||
LockGuard global_lock(queries_mutex);
|
||||
auto it = queries.find(query_id);
|
||||
|
||||
/// The query might have finished while the scheduled task is running.
|
||||
if (it == queries.end())
|
||||
{
|
||||
global_lock.unlock();
|
||||
LOG_TRACE(logger, "Query {} not found in the list. Finished while this collecting task was running", query_id);
|
||||
return;
|
||||
}
|
||||
|
||||
auto & query_status = it->second;
|
||||
if (!query_status.mutex)
|
||||
{
|
||||
global_lock.unlock();
|
||||
LOG_TRACE(logger, "Query {} finished while this collecting task was running", query_id);
|
||||
return;
|
||||
}
|
||||
|
||||
LockGuard query_lock(query_status.getMutex());
|
||||
global_lock.unlock();
|
||||
|
||||
auto elem = query_status.createLogMetricElement(query_id, *query_info, current_time);
|
||||
if (elem)
|
||||
add(std::move(elem.value()));
|
||||
}
|
||||
|
||||
/// We use TSA_NO_THREAD_SAFETY_ANALYSIS to prevent TSA complaining that we're modifying the query_status fields
|
||||
/// without locking the mutex. Since we're building it from scratch, there's no harm in not holding it.
|
||||
/// If we locked it to make TSA happy, TSAN build would falsely complain about
|
||||
/// lock-order-inversion (potential deadlock)
|
||||
/// which is not a real issue since QueryMetricLogStatus's mutex cannot be locked by anything else
|
||||
/// until we add it to the queries map.
|
||||
void QueryMetricLog::startQuery(const String & query_id, TimePoint start_time, UInt64 interval_milliseconds) TSA_NO_THREAD_SAFETY_ANALYSIS
|
||||
{
|
||||
QueryMetricLogStatus query_status;
|
||||
query_status.interval_milliseconds = interval_milliseconds;
|
||||
query_status.next_collect_time = start_time + std::chrono::milliseconds(interval_milliseconds);
|
||||
|
||||
auto context = getContext();
|
||||
const auto & process_list = context->getProcessList();
|
||||
status.task = context->getSchedulePool().createTask("QueryMetricLog", [this, &process_list, query_id] {
|
||||
auto current_time = std::chrono::system_clock::now();
|
||||
const auto query_info = process_list.getQueryInfo(query_id, false, true, false);
|
||||
if (!query_info)
|
||||
{
|
||||
LOG_TRACE(logger, "Query {} is not running anymore, so we couldn't get its QueryStatusInfo", query_id);
|
||||
return;
|
||||
}
|
||||
|
||||
auto elem = createLogMetricElement(query_id, *query_info, current_time);
|
||||
if (elem)
|
||||
add(std::move(elem.value()));
|
||||
query_status.task = context->getSchedulePool().createTask("QueryMetricLog", [this, &process_list, query_id] {
|
||||
collectMetric(process_list, query_id);
|
||||
});
|
||||
|
||||
std::lock_guard lock(queries_mutex);
|
||||
status.task->scheduleAfter(interval_milliseconds);
|
||||
queries.emplace(query_id, std::move(status));
|
||||
LockGuard global_lock(queries_mutex);
|
||||
query_status.scheduleNext(query_id);
|
||||
queries.emplace(query_id, std::move(query_status));
|
||||
}
|
||||
|
||||
void QueryMetricLog::finishQuery(const String & query_id, TimePoint finish_time, QueryStatusInfoPtr query_info)
|
||||
{
|
||||
std::unique_lock lock(queries_mutex);
|
||||
LockGuard global_lock(queries_mutex);
|
||||
auto it = queries.find(query_id);
|
||||
|
||||
/// finishQuery may be called from logExceptionBeforeStart when the query has not even started
|
||||
@ -124,9 +171,19 @@ void QueryMetricLog::finishQuery(const String & query_id, TimePoint finish_time,
|
||||
if (it == queries.end())
|
||||
return;
|
||||
|
||||
auto & query_status = it->second;
|
||||
decltype(query_status.mutex) query_mutex;
|
||||
LockGuard query_lock(query_status.getMutex());
|
||||
|
||||
/// Move the query mutex here so that we hold it until the end, after removing the query from queries.
|
||||
query_mutex = std::move(query_status.mutex);
|
||||
query_status.mutex = {};
|
||||
|
||||
global_lock.unlock();
|
||||
|
||||
if (query_info)
|
||||
{
|
||||
auto elem = createLogMetricElement(query_id, *query_info, finish_time, false);
|
||||
auto elem = query_status.createLogMetricElement(query_id, *query_info, finish_time, false);
|
||||
if (elem)
|
||||
add(std::move(elem.value()));
|
||||
}
|
||||
@ -139,51 +196,62 @@ void QueryMetricLog::finishQuery(const String & query_id, TimePoint finish_time,
|
||||
/// that order.
|
||||
{
|
||||
/// Take ownership of the task so that we can destroy it in this scope after unlocking `queries_mutex`.
|
||||
auto task = std::move(it->second.task);
|
||||
auto task = std::move(query_status.task);
|
||||
|
||||
/// Build an empty task for the old task to make sure it does not lock any mutex on its destruction.
|
||||
it->second.task = {};
|
||||
query_status.task = {};
|
||||
query_lock.unlock();
|
||||
|
||||
global_lock.lock();
|
||||
queries.erase(query_id);
|
||||
|
||||
/// Ensure `queries_mutex` is unlocked before calling task's destructor at the end of this
|
||||
/// scope which will lock `exec_mutex`.
|
||||
lock.unlock();
|
||||
global_lock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<QueryMetricLogElement> QueryMetricLog::createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next)
|
||||
void QueryMetricLogStatus::scheduleNext(String query_id)
|
||||
{
|
||||
/// fmtlib supports subsecond formatting in 10.0.0. We're in 9.1.0, so we need to add the milliseconds ourselves.
|
||||
auto seconds = std::chrono::time_point_cast<std::chrono::seconds>(query_info_time);
|
||||
auto microseconds = std::chrono::duration_cast<std::chrono::microseconds>(query_info_time - seconds).count();
|
||||
LOG_DEBUG(logger, "Collecting query_metric_log for query {} with QueryStatusInfo from {:%Y.%m.%d %H:%M:%S}.{:06}. Schedule next: {}", query_id, seconds, microseconds, schedule_next);
|
||||
|
||||
std::unique_lock lock(queries_mutex);
|
||||
auto query_status_it = queries.find(query_id);
|
||||
|
||||
/// The query might have finished while the scheduled task is running.
|
||||
if (query_status_it == queries.end())
|
||||
const auto now = std::chrono::system_clock::now();
|
||||
if (next_collect_time > now)
|
||||
{
|
||||
lock.unlock();
|
||||
LOG_TRACE(logger, "Query {} finished already while this collecting task was running", query_id);
|
||||
return {};
|
||||
const auto wait_time = std::chrono::duration_cast<std::chrono::milliseconds>(next_collect_time - now).count();
|
||||
task->scheduleAfter(wait_time);
|
||||
}
|
||||
|
||||
auto & query_status = query_status_it->second;
|
||||
if (query_info_time <= query_status.last_collect_time)
|
||||
else
|
||||
{
|
||||
LOG_TRACE(logger, "The next collecting task for query {} should have already run at {}. Scheduling it right now",
|
||||
query_id, timePointToString(next_collect_time));
|
||||
task->schedule();
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<QueryMetricLogElement> QueryMetricLogStatus::createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next)
|
||||
{
|
||||
LOG_TRACE(logger, "Collecting query_metric_log for query {} and interval {} ms with QueryStatusInfo from {}. Schedule next: {}",
|
||||
query_id, interval_milliseconds, timePointToString(query_info_time), schedule_next);
|
||||
|
||||
if (query_info_time <= last_collect_time)
|
||||
{
|
||||
lock.unlock();
|
||||
LOG_TRACE(logger, "Query {} has a more recent metrics collected. Skipping this one", query_id);
|
||||
return {};
|
||||
}
|
||||
|
||||
query_status.last_collect_time = query_info_time;
|
||||
/// Leave some margin because task->scheduleAfter takes a value in milliseconds.
|
||||
/// So, we can expect up to 1ms of drift since BackgroundSchedulePool will compare
|
||||
/// time points in milliseconds.
|
||||
static auto error_margin = std::chrono::milliseconds(1);
|
||||
if (schedule_next && query_info_time + error_margin < next_collect_time)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Task to collect metric for query {} scheduled at {} but run at {}",
|
||||
query_id, timePointToString(next_collect_time), timePointToString(query_info_time));
|
||||
|
||||
last_collect_time = query_info_time;
|
||||
|
||||
QueryMetricLogElement elem;
|
||||
elem.event_time = timeInSeconds(query_info_time);
|
||||
elem.event_time_microseconds = timeInMicroseconds(query_info_time);
|
||||
elem.query_id = query_status_it->first;
|
||||
elem.query_id = query_id;
|
||||
elem.memory_usage = query_info.memory_usage > 0 ? query_info.memory_usage : 0;
|
||||
elem.peak_memory_usage = query_info.peak_memory_usage > 0 ? query_info.peak_memory_usage : 0;
|
||||
|
||||
@ -192,7 +260,7 @@ std::optional<QueryMetricLogElement> QueryMetricLog::createLogMetricElement(cons
|
||||
for (ProfileEvents::Event i = ProfileEvents::Event(0), end = ProfileEvents::end(); i < end; ++i)
|
||||
{
|
||||
const auto & new_value = (*(query_info.profile_counters))[i];
|
||||
auto & old_value = query_status.last_profile_events[i];
|
||||
auto & old_value = last_profile_events[i];
|
||||
|
||||
/// Profile event counters are supposed to be monotonic. However, at least the `NetworkReceiveBytes` can be inaccurate.
|
||||
/// So, since in the future the counter should always have a bigger value than in the past, we skip this event.
|
||||
@ -214,9 +282,8 @@ std::optional<QueryMetricLogElement> QueryMetricLog::createLogMetricElement(cons
|
||||
|
||||
if (schedule_next)
|
||||
{
|
||||
query_status.next_collect_time += std::chrono::milliseconds(query_status.interval_milliseconds);
|
||||
const auto wait_time = std::chrono::duration_cast<std::chrono::milliseconds>(query_status.next_collect_time - std::chrono::system_clock::now()).count();
|
||||
query_status.task->scheduleAfter(wait_time);
|
||||
next_collect_time += std::chrono::milliseconds(interval_milliseconds);
|
||||
scheduleNext(query_id);
|
||||
}
|
||||
|
||||
return elem;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <base/defines.h>
|
||||
#include <Common/ProfileEvents.h>
|
||||
#include <Common/CurrentMetrics.h>
|
||||
#include <Core/BackgroundSchedulePool.h>
|
||||
@ -11,11 +12,17 @@
|
||||
|
||||
#include <chrono>
|
||||
#include <ctime>
|
||||
#include <mutex>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
};
|
||||
|
||||
/** QueryMetricLogElement is a log of query metric values measured at regular time interval.
|
||||
*/
|
||||
|
||||
@ -36,31 +43,49 @@ struct QueryMetricLogElement
|
||||
|
||||
struct QueryMetricLogStatus
|
||||
{
|
||||
using TimePoint = std::chrono::system_clock::time_point;
|
||||
using Mutex = std::mutex;
|
||||
|
||||
UInt64 interval_milliseconds;
|
||||
std::chrono::system_clock::time_point last_collect_time;
|
||||
std::chrono::system_clock::time_point next_collect_time;
|
||||
std::vector<ProfileEvents::Count> last_profile_events = std::vector<ProfileEvents::Count>(ProfileEvents::end());
|
||||
BackgroundSchedulePool::TaskHolder task;
|
||||
std::chrono::system_clock::time_point last_collect_time TSA_GUARDED_BY(getMutex());
|
||||
std::chrono::system_clock::time_point next_collect_time TSA_GUARDED_BY(getMutex());
|
||||
std::vector<ProfileEvents::Count> last_profile_events TSA_GUARDED_BY(getMutex()) = std::vector<ProfileEvents::Count>(ProfileEvents::end());
|
||||
BackgroundSchedulePool::TaskHolder task TSA_GUARDED_BY(getMutex());
|
||||
|
||||
/// We need to be able to move it for the hash map, so we need to add an indirection here.
|
||||
std::unique_ptr<Mutex> mutex = std::make_unique<Mutex>();
|
||||
|
||||
/// Return a reference to the mutex, used for Thread Sanitizer annotations.
|
||||
Mutex & getMutex() const TSA_RETURN_CAPABILITY(mutex)
|
||||
{
|
||||
if (!mutex)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mutex cannot be NULL");
|
||||
return *mutex;
|
||||
}
|
||||
|
||||
void scheduleNext(String query_id) TSA_REQUIRES(getMutex());
|
||||
std::optional<QueryMetricLogElement> createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next = true) TSA_REQUIRES(getMutex());
|
||||
};
|
||||
|
||||
class QueryMetricLog : public SystemLog<QueryMetricLogElement>
|
||||
{
|
||||
using SystemLog<QueryMetricLogElement>::SystemLog;
|
||||
using TimePoint = std::chrono::system_clock::time_point;
|
||||
using Base = SystemLog<QueryMetricLogElement>;
|
||||
|
||||
public:
|
||||
using TimePoint = std::chrono::system_clock::time_point;
|
||||
|
||||
void shutdown() final;
|
||||
|
||||
// Both startQuery and finishQuery are called from the thread that executes the query
|
||||
/// Both startQuery and finishQuery are called from the thread that executes the query.
|
||||
void startQuery(const String & query_id, TimePoint start_time, UInt64 interval_milliseconds);
|
||||
void finishQuery(const String & query_id, TimePoint finish_time, QueryStatusInfoPtr query_info = nullptr);
|
||||
|
||||
private:
|
||||
std::optional<QueryMetricLogElement> createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next = true);
|
||||
void collectMetric(const ProcessList & process_list, String query_id);
|
||||
|
||||
std::recursive_mutex queries_mutex;
|
||||
std::unordered_map<String, QueryMetricLogStatus> queries;
|
||||
std::mutex queries_mutex;
|
||||
std::unordered_map<String, QueryMetricLogStatus> queries TSA_GUARDED_BY(queries_mutex);
|
||||
};
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user