From c9b843a6bb6563f0e00860102c76ecd021015c8e Mon Sep 17 00:00:00 2001 From: Pablo Marcos Date: Tue, 19 Nov 2024 10:54:43 +0000 Subject: [PATCH 1/2] Demote LOG_DEBUG and LOG_TRACE to LOG_TEST --- src/Interpreters/QueryMetricLog.cpp | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/Interpreters/QueryMetricLog.cpp b/src/Interpreters/QueryMetricLog.cpp index 52d773b7d1b..34f04b355ab 100644 --- a/src/Interpreters/QueryMetricLog.cpp +++ b/src/Interpreters/QueryMetricLog.cpp @@ -102,8 +102,7 @@ void QueryMetricLog::collectMetric(const ProcessList & process_list, String quer const auto query_info = process_list.getQueryInfo(query_id, false, true, false); if (!query_info) { - /// TODO: remove trace before 24.11 release after checking everything is fine on the CI - LOG_TRACE(logger, "Query {} is not running anymore, so we couldn't get its QueryStatusInfo", query_id); + LOG_TEST(logger, "Query {} is not running anymore, so we couldn't get its QueryStatusInfo", query_id); return; } @@ -114,8 +113,7 @@ void QueryMetricLog::collectMetric(const ProcessList & process_list, String quer if (it == queries.end()) { global_lock.unlock(); - /// TODO: remove trace before 24.11 release after checking everything is fine on the CI - LOG_TRACE(logger, "Query {} not found in the list. Finished while this collecting task was running", query_id); + LOG_TEST(logger, "Query {} not found in the list. Finished while this collecting task was running", query_id); return; } @@ -123,8 +121,7 @@ void QueryMetricLog::collectMetric(const ProcessList & process_list, String quer if (!query_status.mutex) { global_lock.unlock(); - /// TODO: remove trace before 24.11 release after checking everything is fine on the CI - LOG_TRACE(logger, "Query {} finished while this collecting task was running", query_id); + LOG_TEST(logger, "Query {} finished while this collecting task was running", query_id); return; } @@ -221,7 +218,7 @@ void QueryMetricLogStatus::scheduleNext(String query_id) } else { - LOG_TRACE(logger, "The next collecting task for query {} should have already run at {}. Scheduling it right now", + LOG_TEST(logger, "The next collecting task for query {} should have already run at {}. Scheduling it right now", query_id, timePointToString(info.next_collect_time)); info.task->schedule(); } @@ -229,15 +226,13 @@ void QueryMetricLogStatus::scheduleNext(String query_id) std::optional QueryMetricLogStatus::createLogMetricElement(const String & query_id, const QueryStatusInfo & query_info, TimePoint query_info_time, bool schedule_next) { - /// TODO: remove trace before 24.11 release after checking everything is fine on the CI - LOG_TRACE(logger, "Collecting query_metric_log for query {} and interval {} ms with QueryStatusInfo from {}. Next collection time: {}", + LOG_TEST(logger, "Collecting query_metric_log for query {} and interval {} ms with QueryStatusInfo from {}. Next collection time: {}", query_id, info.interval_milliseconds, timePointToString(query_info_time), schedule_next ? timePointToString(info.next_collect_time + std::chrono::milliseconds(info.interval_milliseconds)) : "finished"); if (query_info_time <= info.last_collect_time) { - /// TODO: remove trace before 24.11 release after checking everything is fine on the CI - LOG_TRACE(logger, "Query {} has a more recent metrics collected. Skipping this one", query_id); + LOG_TEST(logger, "Query {} has a more recent metrics collected. Skipping this one", query_id); return {}; } @@ -271,8 +266,7 @@ std::optional QueryMetricLogStatus::createLogMetricElemen } else { - /// TODO: remove trace before 24.11 release after checking everything is fine on the CI - LOG_DEBUG(logger, "Query {} has no profile counters", query_id); + LOG_TEST(logger, "Query {} has no profile counters", query_id); elem.profile_events = std::vector(ProfileEvents::end()); } From b4a6ee77ab8ade7ed816645a416dc8f5c8d3d11f Mon Sep 17 00:00:00 2001 From: Pablo Marcos Date: Tue, 19 Nov 2024 10:59:59 +0000 Subject: [PATCH 2/2] Add more diff margin depending on number of event rows There's been a case where the avg(diff) for the 1000ms query has been 1205ms, which is over 1000 * 1.2. In order to leave more margin for the case where the sample of rows is low, while leaving it more strict when the number of rows is higher, we use an adaptive margin: 20 + 20/event_rows --- .../03203_system_query_metric_log.sh | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/tests/queries/0_stateless/03203_system_query_metric_log.sh b/tests/queries/0_stateless/03203_system_query_metric_log.sh index abcd14c8e5d..d8ae078fd5e 100755 --- a/tests/queries/0_stateless/03203_system_query_metric_log.sh +++ b/tests/queries/0_stateless/03203_system_query_metric_log.sh @@ -32,23 +32,31 @@ function check_log() # We calculate the diff of each row with its previous row to check whether the intervals at # which data is collected is right. The first row is always skipped because the diff with the # preceding one (itself) is 0. The last row is also skipped, because it doesn't contain a full - # interval. + # interval. We leave at least 20% of margin for many rows and at most 40% for one single row. $CLICKHOUSE_CLIENT --max_threads=1 -m -q """ SELECT '--Interval $interval: check that the delta/diff between the events is correct'; - WITH diff AS ( - SELECT - row_number() OVER () AS row, - count() OVER () as total_rows, - event_time_microseconds, - first_value(event_time_microseconds) OVER (ORDER BY event_time_microseconds ROWS BETWEEN 1 PRECEDING AND 0 FOLLOWING) as prev, - dateDiff('ms', prev, event_time_microseconds) AS diff - FROM system.query_metric_log - WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}' - ORDER BY event_time_microseconds - OFFSET 1 + WITH + (SELECT count() - 2 FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}') as diff_rows, + (20 + 20 / diff_rows)/100 AS margin + SELECT + avg(diff) BETWEEN (1 - margin) * $interval AND (1 + margin) * $interval + FROM ( + WITH diff AS ( + SELECT + row_number() OVER () AS row, + count() OVER () as total_rows, + event_time_microseconds, + first_value(event_time_microseconds) OVER (ORDER BY event_time_microseconds ROWS BETWEEN 1 PRECEDING AND 0 FOLLOWING) as prev, + dateDiff('ms', prev, event_time_microseconds) AS diff + FROM system.query_metric_log + WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}' + ORDER BY event_time_microseconds + OFFSET 1 + ) + SELECT avg(diff) AS diff + FROM diff + WHERE row < total_rows ) - SELECT avg(diff) BETWEEN $interval * 0.8 AND $interval * 1.2 - FROM diff WHERE row < total_rows """ # Check that the first event contains information from the beginning of the query.