Merge pull request #71473 from ClickHouse/improve-system-query-metric-log

Improve performance of system.query_metric_log
This commit is contained in:
Pablo Marcos 2024-11-13 08:38:13 +00:00 committed by GitHub
commit 9caac43b2a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 187 additions and 72 deletions

View File

@ -145,6 +145,7 @@
#define TSA_TRY_ACQUIRE_SHARED(...) __attribute__((try_acquire_shared_capability(__VA_ARGS__))) /// function tries to acquire a shared capability and returns a boolean value indicating success or failure
#define TSA_RELEASE_SHARED(...) __attribute__((release_shared_capability(__VA_ARGS__))) /// function releases the given shared capability
#define TSA_SCOPED_LOCKABLE __attribute__((scoped_lockable)) /// object of a class has scoped lockable capability
#define TSA_RETURN_CAPABILITY(...) __attribute__((lock_returned(__VA_ARGS__))) /// to return capabilities in functions
/// Macros for suppressing TSA warnings for specific reads/writes (instead of suppressing it for the whole function)
/// They use a lambda function to apply function attribute to a single statement. This enable us to suppress warnings locally instead of

View File

@ -1,23 +1,47 @@
#pragma once
#include <Common/OvercommitTracker.h>
#include <base/defines.h>
#include <Common/Exception.h>
#include <Common/OvercommitTracker.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
};
/** LockGuard provides RAII-style locking mechanism for a mutex.
** It's intended to be used like std::unique_ptr but with TSA annotations
** It's intended to be used like std::unique_lock but with TSA annotations
*/
template <typename Mutex>
class TSA_SCOPED_LOCKABLE LockGuard
{
public:
explicit LockGuard(Mutex & mutex_) TSA_ACQUIRE(mutex_) : mutex(mutex_) { mutex.lock(); }
~LockGuard() TSA_RELEASE() { mutex.unlock(); }
explicit LockGuard(Mutex & mutex_) TSA_ACQUIRE(mutex_) : mutex(mutex_) { lock(); }
~LockGuard() TSA_RELEASE() { if (locked) unlock(); }
void lock() TSA_ACQUIRE()
{
/// Don't allow recursive_mutex for now.
if (locked)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't lock twice the same mutex");
mutex.lock();
locked = true;
}
void unlock() TSA_RELEASE()
{
if (!locked)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't unlock the mutex without locking it first");
mutex.unlock();
locked = false;
}
private:
Mutex & mutex;
bool locked = false;
};
template <template<typename> typename TLockGuard, typename Mutex>

View File

@ -1,6 +1,7 @@
#include <base/getFQDNOrHostName.h>
#include <Common/DateLUT.h>
#include <Common/DateLUTImpl.h>
#include <Common/LockGuard.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeDateTime64.h>
@ -16,7 +17,6 @@
#include <chrono>
#include <fmt/chrono.h>
#include <mutex>
namespace DB
@ -24,6 +24,15 @@ namespace DB
static auto logger = getLogger("QueryMetricLog");
String timePointToString(QueryMetricLog::TimePoint time)
{
/// fmtlib supports subsecond formatting in 10.0.0. We're in 9.1.0, so we need to add the milliseconds ourselves.
auto seconds = std::chrono::time_point_cast<std::chrono::seconds>(time);
auto microseconds = std::chrono::duration_cast<std::chrono::microseconds>(time - seconds).count();
return fmt::format("{:%Y.%m.%d %H:%M:%S}.{:06}", seconds, microseconds);
}
ColumnsDescription QueryMetricLogElement::getColumnsDescription()
{
ColumnsDescription result;
@ -87,36 +96,73 @@ void QueryMetricLog::shutdown()
Base::shutdown();
}
void QueryMetricLog::startQuery(const String & query_id, TimePoint start_time, UInt64 interval_milliseconds)
void QueryMetricLog::collectMetric(const ProcessList & process_list, String query_id)
{
QueryMetricLogStatus status;
status.interval_milliseconds = interval_milliseconds;
status.next_collect_time = start_time + std::chrono::milliseconds(interval_milliseconds);
auto context = getContext();
const auto & process_list = context->getProcessList();
status.task = context->getSchedulePool().createTask("QueryMetricLog", [this, &process_list, query_id] {
auto current_time = std::chrono::system_clock::now();
const auto query_info = process_list.getQueryInfo(query_id, false, true, false);
if (!query_info)
{
/// TODO: remove trace before 24.11 release after checking everything is fine on the CI
LOG_TRACE(logger, "Query {} is not running anymore, so we couldn't get its QueryStatusInfo", query_id);
return;
}
auto elem = createLogMetricElement(query_id, *query_info, current_time);
LockGuard global_lock(queries_mutex);
auto it = queries.find(query_id);
/// The query might have finished while the scheduled task is running.
if (it == queries.end())
{
global_lock.unlock();
/// TODO: remove trace before 24.11 release after checking everything is fine on the CI
LOG_TRACE(logger, "Query {} not found in the list. Finished while this collecting task was running", query_id);
return;
}
auto & query_status = it->second;
if (!query_status.mutex)
{
global_lock.unlock();
/// TODO: remove trace before 24.11 release after checking everything is fine on the CI
LOG_TRACE(logger, "Query {} finished while this collecting task was running", query_id);
return;
}
LockGuard query_lock(query_status.getMutex());
global_lock.unlock();
auto elem = query_status.createLogMetricElement(query_id, *query_info, current_time);
if (elem)
add(std::move(elem.value()));
}
/// We use TSA_NO_THREAD_SAFETY_ANALYSIS to prevent TSA complaining that we're modifying the query_status fields
/// without locking the mutex. Since we're building it from scratch, there's no harm in not holding it.
/// If we locked it to make TSA happy, TSAN build would falsely complain about
/// lock-order-inversion (potential deadlock)
/// which is not a real issue since QueryMetricLogStatus's mutex cannot be locked by anything else
/// until we add it to the queries map.
void QueryMetricLog::startQuery(const String & query_id, TimePoint start_time, UInt64 interval_milliseconds) TSA_NO_THREAD_SAFETY_ANALYSIS
{
QueryMetricLogStatus query_status;
QueryMetricLogStatusInfo & info = query_status.info;
info.interval_milliseconds = interval_milliseconds;
info.next_collect_time = start_time;
auto context = getContext();
const auto & process_list = context->getProcessList();
info.task = context->getSchedulePool().createTask("QueryMetricLog", [this, &process_list, query_id] {
collectMetric(process_list, query_id);
});
std::lock_guard lock(queries_mutex);
status.task->scheduleAfter(interval_milliseconds);
queries.emplace(query_id, std::move(status));
LockGuard global_lock(queries_mutex);
query_status.scheduleNext(query_id);
queries.emplace(query_id, std::move(query_status));
}
void QueryMetricLog::finishQuery(const String & query_id, TimePoint finish_time, QueryStatusInfoPtr query_info)
{
std::unique_lock lock(queries_mutex);
LockGuard global_lock(queries_mutex);
auto it = queries.find(query_id);
/// finishQuery may be called from logExceptionBeforeStart when the query has not even started
@ -124,9 +170,19 @@ void QueryMetricLog::finishQuery(const String & query_id, TimePoint finish_time,
if (it == queries.end())
return;
auto & query_status = it->second;
decltype(query_status.mutex) query_mutex;
LockGuard query_lock(query_status.getMutex());
/// Move the query mutex here so that we hold it until the end, after removing the query from queries.
query_mutex = std::move(query_status.mutex);
query_status.mutex = {};
global_lock.unlock();
if (query_info)
{
auto elem = createLogMetricElement(query_id, *query_info, finish_time, false);
auto elem = query_status.createLogMetricElement(query_id, *query_info, finish_time, false);
if (elem)
add(std::move(elem.value()));
}
@ -139,51 +195,58 @@ void QueryMetricLog::finishQuery(const String & query_id, TimePoint finish_time,
/// that order.
{
/// Take ownership of the task so that we can destroy it in this scope after unlocking `queries_mutex`.
auto task = std::move(it->second.task);
auto task = std::move(query_status.info.task);
/// Build an empty task for the old task to make sure it does not lock any mutex on its destruction.
it->second.task = {};
query_status.info.task = {};
query_lock.unlock();
global_lock.lock();
queries.erase(query_id);
/// Ensure `queries_mutex` is unlocked before calling task's destructor at the end of this
/// scope which will lock `exec_mutex`.
lock.unlock();
global_lock.unlock();
}
}
std::optional<QueryMetricLogElement> QueryMetricLog::createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next)
void QueryMetricLogStatus::scheduleNext(String query_id)
{
/// fmtlib supports subsecond formatting in 10.0.0. We're in 9.1.0, so we need to add the milliseconds ourselves.
auto seconds = std::chrono::time_point_cast<std::chrono::seconds>(query_info_time);
auto microseconds = std::chrono::duration_cast<std::chrono::microseconds>(query_info_time - seconds).count();
LOG_DEBUG(logger, "Collecting query_metric_log for query {} with QueryStatusInfo from {:%Y.%m.%d %H:%M:%S}.{:06}. Schedule next: {}", query_id, seconds, microseconds, schedule_next);
std::unique_lock lock(queries_mutex);
auto query_status_it = queries.find(query_id);
/// The query might have finished while the scheduled task is running.
if (query_status_it == queries.end())
info.next_collect_time += std::chrono::milliseconds(info.interval_milliseconds);
const auto now = std::chrono::system_clock::now();
if (info.next_collect_time > now)
{
lock.unlock();
LOG_TRACE(logger, "Query {} finished already while this collecting task was running", query_id);
return {};
const auto wait_time = std::chrono::duration_cast<std::chrono::milliseconds>(info.next_collect_time - now).count();
info.task->scheduleAfter(wait_time);
}
else
{
LOG_TRACE(logger, "The next collecting task for query {} should have already run at {}. Scheduling it right now",
query_id, timePointToString(info.next_collect_time));
info.task->schedule();
}
}
auto & query_status = query_status_it->second;
if (query_info_time <= query_status.last_collect_time)
std::optional<QueryMetricLogElement> QueryMetricLogStatus::createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next)
{
lock.unlock();
/// TODO: remove trace before 24.11 release after checking everything is fine on the CI
LOG_TRACE(logger, "Collecting query_metric_log for query {} and interval {} ms with QueryStatusInfo from {}. Next collection time: {}",
query_id, info.interval_milliseconds, timePointToString(query_info_time),
schedule_next ? timePointToString(info.next_collect_time + std::chrono::milliseconds(info.interval_milliseconds)) : "finished");
if (query_info_time <= info.last_collect_time)
{
/// TODO: remove trace before 24.11 release after checking everything is fine on the CI
LOG_TRACE(logger, "Query {} has a more recent metrics collected. Skipping this one", query_id);
return {};
}
query_status.last_collect_time = query_info_time;
info.last_collect_time = query_info_time;
QueryMetricLogElement elem;
elem.event_time = timeInSeconds(query_info_time);
elem.event_time_microseconds = timeInMicroseconds(query_info_time);
elem.query_id = query_status_it->first;
elem.query_id = query_id;
elem.memory_usage = query_info.memory_usage > 0 ? query_info.memory_usage : 0;
elem.peak_memory_usage = query_info.peak_memory_usage > 0 ? query_info.peak_memory_usage : 0;
@ -192,7 +255,7 @@ std::optional<QueryMetricLogElement> QueryMetricLog::createLogMetricElement(cons
for (ProfileEvents::Event i = ProfileEvents::Event(0), end = ProfileEvents::end(); i < end; ++i)
{
const auto & new_value = (*(query_info.profile_counters))[i];
auto & old_value = query_status.last_profile_events[i];
auto & old_value = info.last_profile_events[i];
/// Profile event counters are supposed to be monotonic. However, at least the `NetworkReceiveBytes` can be inaccurate.
/// So, since in the future the counter should always have a bigger value than in the past, we skip this event.
@ -208,16 +271,13 @@ std::optional<QueryMetricLogElement> QueryMetricLog::createLogMetricElement(cons
}
else
{
LOG_TRACE(logger, "Query {} has no profile counters", query_id);
/// TODO: remove trace before 24.11 release after checking everything is fine on the CI
LOG_DEBUG(logger, "Query {} has no profile counters", query_id);
elem.profile_events = std::vector<ProfileEvents::Count>(ProfileEvents::end());
}
if (schedule_next)
{
query_status.next_collect_time += std::chrono::milliseconds(query_status.interval_milliseconds);
const auto wait_time = std::chrono::duration_cast<std::chrono::milliseconds>(query_status.next_collect_time - std::chrono::system_clock::now()).count();
query_status.task->scheduleAfter(wait_time);
}
scheduleNext(query_id);
return elem;
}

View File

@ -1,5 +1,6 @@
#pragma once
#include <base/defines.h>
#include <Common/ProfileEvents.h>
#include <Common/CurrentMetrics.h>
#include <Core/BackgroundSchedulePool.h>
@ -11,11 +12,17 @@
#include <chrono>
#include <ctime>
#include <mutex>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
};
/** QueryMetricLogElement is a log of query metric values measured at regular time interval.
*/
@ -34,7 +41,7 @@ struct QueryMetricLogElement
void appendToBlock(MutableColumns & columns) const;
};
struct QueryMetricLogStatus
struct QueryMetricLogStatusInfo
{
UInt64 interval_milliseconds;
std::chrono::system_clock::time_point last_collect_time;
@ -43,24 +50,47 @@ struct QueryMetricLogStatus
BackgroundSchedulePool::TaskHolder task;
};
struct QueryMetricLogStatus
{
using TimePoint = std::chrono::system_clock::time_point;
using Mutex = std::mutex;
QueryMetricLogStatusInfo info TSA_GUARDED_BY(getMutex());
/// We need to be able to move it for the hash map, so we need to add an indirection here.
std::unique_ptr<Mutex> mutex = std::make_unique<Mutex>();
/// Return a reference to the mutex, used for Thread Sanitizer annotations.
Mutex & getMutex() const TSA_RETURN_CAPABILITY(mutex)
{
if (!mutex)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mutex cannot be NULL");
return *mutex;
}
void scheduleNext(String query_id) TSA_REQUIRES(getMutex());
std::optional<QueryMetricLogElement> createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next = true) TSA_REQUIRES(getMutex());
};
class QueryMetricLog : public SystemLog<QueryMetricLogElement>
{
using SystemLog<QueryMetricLogElement>::SystemLog;
using TimePoint = std::chrono::system_clock::time_point;
using Base = SystemLog<QueryMetricLogElement>;
public:
using TimePoint = std::chrono::system_clock::time_point;
void shutdown() final;
// Both startQuery and finishQuery are called from the thread that executes the query
/// Both startQuery and finishQuery are called from the thread that executes the query.
void startQuery(const String & query_id, TimePoint start_time, UInt64 interval_milliseconds);
void finishQuery(const String & query_id, TimePoint finish_time, QueryStatusInfoPtr query_info = nullptr);
private:
std::optional<QueryMetricLogElement> createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next = true);
void collectMetric(const ProcessList & process_list, String query_id);
std::recursive_mutex queries_mutex;
std::unordered_map<String, QueryMetricLogStatus> queries;
std::mutex queries_mutex;
std::unordered_map<String, QueryMetricLogStatus> queries TSA_GUARDED_BY(queries_mutex);
};
}

View File

@ -505,6 +505,7 @@ void logQueryFinish(
auto time_now = std::chrono::system_clock::now();
QueryStatusInfo info = process_list_elem->getInfo(true, settings[Setting::log_profile_events]);
logQueryMetricLogFinish(context, internal, elem.client_info.current_query_id, time_now, std::make_shared<QueryStatusInfo>(info));
elem.type = QueryLogElementType::QUERY_FINISH;
addStatusInfoToQueryLogElement(elem, info, query_ast, context);
@ -623,6 +624,7 @@ void logQueryException(
{
elem.query_duration_ms = start_watch.elapsedMilliseconds();
}
logQueryMetricLogFinish(context, internal, elem.client_info.current_query_id, time_now, info);
elem.query_cache_usage = QueryCache::Usage::None;
@ -652,8 +654,6 @@ void logQueryException(
query_span->addAttribute("clickhouse.exception_code", elem.exception_code);
query_span->finish();
}
logQueryMetricLogFinish(context, internal, elem.client_info.current_query_id, time_now, info);
}
void logExceptionBeforeStart(
@ -707,6 +707,8 @@ void logExceptionBeforeStart(
elem.client_info = context->getClientInfo();
logQueryMetricLogFinish(context, false, elem.client_info.current_query_id, std::chrono::system_clock::now(), nullptr);
elem.log_comment = settings[Setting::log_comment];
if (elem.log_comment.size() > settings[Setting::max_query_size])
elem.log_comment.resize(settings[Setting::max_query_size]);
@ -751,8 +753,6 @@ void logExceptionBeforeStart(
ProfileEvents::increment(ProfileEvents::FailedInsertQuery);
}
}
logQueryMetricLogFinish(context, false, elem.client_info.current_query_id, std::chrono::system_clock::now(), nullptr);
}
void validateAnalyzerSettings(ASTPtr ast, bool context_value)

View File

@ -23,8 +23,8 @@
--Interval 123: check that the SleepFunctionCalls, SleepFunctionMilliseconds and ProfileEvent_SleepFunctionElapsedMicroseconds are correct
1
--Check that a query_metric_log_interval=0 disables the collection
0
1
-Check that a query which execution time is less than query_metric_log_interval is never collected
0
1
--Check that there is a final event when queries finish
3
1

View File

@ -84,17 +84,17 @@ check_log 123
# query_metric_log_interval=0 disables the collection altogether
$CLICKHOUSE_CLIENT -m -q """
SELECT '--Check that a query_metric_log_interval=0 disables the collection';
SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_0'
SELECT count() == 0 FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_0'
"""
# a quick query that takes less than query_metric_log_interval is never collected
$CLICKHOUSE_CLIENT -m -q """
SELECT '-Check that a query which execution time is less than query_metric_log_interval is never collected';
SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_fast'
SELECT count() == 0 FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_fast'
"""
# a query that takes more than query_metric_log_interval is collected including the final row
$CLICKHOUSE_CLIENT -m -q """
SELECT '--Check that there is a final event when queries finish';
SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_1000'
SELECT count() > 2 FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_1000'
"""