Improve system.query_metric_log to remove flakiness

- Use an interval of 400ms instead of 1234ms for the 2500ms
  sleep to avoid having a last collection that may clash
  with the finish one.
- Move the check for number of events to a separate check.
  This way we don't have to remove the first and last event,
  which makes the check less good the fewer events we have.
- Add explicit comments of what each check does for readability.
This commit is contained in:
Pablo Marcos 2024-10-31 08:51:52 +00:00
parent d14ca872d4
commit e126092c1f
2 changed files with 74 additions and 37 deletions

View File

@ -1,12 +1,30 @@
number_of_metrics_1000_ok timestamp_diff_in_metrics_1000_ok
initial_data_1000_ok
data_1000_ok
number_of_metrics_1234_ok timestamp_diff_in_metrics_1234_ok
initial_data_1234_ok
data_1234_ok
number_of_metrics_123_ok timestamp_diff_in_metrics_123_ok
initial_data_123_ok
data_123_ok
--Interval 1000: check that amount of events is correct
1
--Interval 1000: check that the delta/diff between the events is correct
1
--Interval 1000: check that the Query, SelectQuery and InitialQuery values are correct for the first event
1
--Interval 1000: check that the SleepFunctionCalls, SleepFunctionMilliseconds and ProfileEvent_SleepFunctionElapsedMicroseconds are correct
1
--Interval 400: check that amount of events is correct
1
--Interval 400: check that the delta/diff between the events is correct
1
--Interval 400: check that the Query, SelectQuery and InitialQuery values are correct for the first event
1
--Interval 400: check that the SleepFunctionCalls, SleepFunctionMilliseconds and ProfileEvent_SleepFunctionElapsedMicroseconds are correct
1
--Interval 123: check that amount of events is correct
1
--Interval 123: check that the delta/diff between the events is correct
1
--Interval 123: check that the Query, SelectQuery and InitialQuery values are correct for the first event
1
--Interval 123: check that the SleepFunctionCalls, SleepFunctionMilliseconds and ProfileEvent_SleepFunctionElapsedMicroseconds are correct
1
--Check that a query_metric_log_interval=0 disables the collection
0
-Check that a query which execution time is less than query_metric_log_interval is never collected
0
--Check that there is a final event when queries finish
3

View File

@ -7,7 +7,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
readonly query_prefix=$CLICKHOUSE_DATABASE
$CLICKHOUSE_CLIENT --query-id="${query_prefix}_1000" -q "SELECT sleep(2.5) FORMAT Null" &
$CLICKHOUSE_CLIENT --query-id="${query_prefix}_1234" -q "SELECT sleep(2.5) SETTINGS query_metric_log_interval=1234 FORMAT Null" &
$CLICKHOUSE_CLIENT --query-id="${query_prefix}_400" -q "SELECT sleep(2.5) SETTINGS query_metric_log_interval=400 FORMAT Null" &
$CLICKHOUSE_CLIENT --query-id="${query_prefix}_123" -q "SELECT sleep(2.5) SETTINGS query_metric_log_interval=123 FORMAT Null" &
$CLICKHOUSE_CLIENT --query-id="${query_prefix}_0" -q "SELECT sleep(2.5) SETTINGS query_metric_log_interval=0 FORMAT Null" &
$CLICKHOUSE_CLIENT --query-id="${query_prefix}_fast" -q "SELECT sleep(0.1) FORMAT Null" &
@ -20,32 +20,42 @@ function check_log()
{
interval=$1
# Check that the amount of events collected is correct, leaving a 20% of margin.
$CLICKHOUSE_CLIENT -m -q """
SELECT '--Interval $interval: check that amount of events is correct';
SELECT
count() BETWEEN (ceil(2500 / $interval) * 0.8) AND (ceil(2500 / $interval) * 1.2)
FROM system.query_metric_log
WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}'
"""
# We calculate the diff of each row with its previous row to check whether the intervals at
# which data is collected is right. The first row is always skipped because the diff with the
# preceding one (itself) is 0. The last row is also skipped, because it doesn't contain a full
# interval.
$CLICKHOUSE_CLIENT --max_threads=1 -m -q """
WITH diff AS (
SELECT
row_number() OVER () AS row,
count() OVER () as total_rows,
event_time_microseconds,
first_value(event_time_microseconds) OVER (ORDER BY event_time_microseconds ROWS BETWEEN 1 PRECEDING AND 0 FOLLOWING) as prev,
dateDiff('ms', prev, event_time_microseconds) AS diff
FROM system.query_metric_log
WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}'
ORDER BY event_time_microseconds
OFFSET 1
)
SELECT if(count() BETWEEN ((ceil(2500 / $interval) - 2) * 0.8) AND ((ceil(2500 / $interval) - 2) * 1.2), 'number_of_metrics_${interval}_ok', 'number_of_metrics_${interval}_error'),
if(avg(diff) BETWEEN $interval * 0.8 AND $interval * 1.2, 'timestamp_diff_in_metrics_${interval}_ok', 'timestamp_diff_in_metrics_${interval}_error')
FROM diff WHERE row < total_rows
SELECT '--Interval $interval: check that the delta/diff between the events is correct';
WITH diff AS (
SELECT
row_number() OVER () AS row,
count() OVER () as total_rows,
event_time_microseconds,
first_value(event_time_microseconds) OVER (ORDER BY event_time_microseconds ROWS BETWEEN 1 PRECEDING AND 0 FOLLOWING) as prev,
dateDiff('ms', prev, event_time_microseconds) AS diff
FROM system.query_metric_log
WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}'
ORDER BY event_time_microseconds
OFFSET 1
)
SELECT avg(diff) BETWEEN $interval * 0.8 AND $interval * 1.2
FROM diff WHERE row < total_rows
"""
# Check that the first event contains information from the beginning of the query.
# Notice the rest of the events won't contain these because the diff will be 0.
$CLICKHOUSE_CLIENT -m -q """
SELECT if(ProfileEvent_Query = 1 AND ProfileEvent_SelectQuery = 1 AND ProfileEvent_InitialQuery = 1, 'initial_data_${interval}_ok', 'initial_data_${interval}_error')
SELECT '--Interval $interval: check that the Query, SelectQuery and InitialQuery values are correct for the first event';
SELECT ProfileEvent_Query = 1 AND ProfileEvent_SelectQuery = 1 AND ProfileEvent_InitialQuery = 1
FROM system.query_metric_log
WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}'
ORDER BY event_time_microseconds
@ -55,27 +65,36 @@ function check_log()
# Also check that it contains some data that we know it's going to be there.
# Notice the Sleep events can be in any of the rows, not only in the first one.
$CLICKHOUSE_CLIENT -m -q """
SELECT if(sum(ProfileEvent_SleepFunctionCalls) = 1 AND
sum(ProfileEvent_SleepFunctionMicroseconds) = 2500000 AND
sum(ProfileEvent_SleepFunctionElapsedMicroseconds) = 2500000 AND
sum(ProfileEvent_Query) = 1 AND
sum(ProfileEvent_SelectQuery) = 1 AND
sum(ProfileEvent_InitialQuery) = 1,
'data_${interval}_ok', 'data_${interval}_error')
SELECT '--Interval $interval: check that the SleepFunctionCalls, SleepFunctionMilliseconds and ProfileEvent_SleepFunctionElapsedMicroseconds are correct';
SELECT sum(ProfileEvent_SleepFunctionCalls) = 1 AND
sum(ProfileEvent_SleepFunctionMicroseconds) = 2500000 AND
sum(ProfileEvent_SleepFunctionElapsedMicroseconds) = 2500000 AND
sum(ProfileEvent_Query) = 1 AND
sum(ProfileEvent_SelectQuery) = 1 AND
sum(ProfileEvent_InitialQuery) = 1
FROM system.query_metric_log
WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}'
"""
}
check_log 1000
check_log 1234
check_log 400
check_log 123
# query_metric_log_interval=0 disables the collection altogether
$CLICKHOUSE_CLIENT -m -q """SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_0'"""
$CLICKHOUSE_CLIENT -m -q """
SELECT '--Check that a query_metric_log_interval=0 disables the collection';
SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_0'
"""
# a quick query that takes less than query_metric_log_interval is never collected
$CLICKHOUSE_CLIENT -m -q """SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_fast'"""
$CLICKHOUSE_CLIENT -m -q """
SELECT '-Check that a query which execution time is less than query_metric_log_interval is never collected';
SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_fast'
"""
# a query that takes more than query_metric_log_interval is collected including the final row
$CLICKHOUSE_CLIENT -m -q """SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_1000'"""
$CLICKHOUSE_CLIENT -m -q """
SELECT '--Check that there is a final event when queries finish';
SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_1000'
"""