mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-15 12:14:18 +00:00
Improve system.query_metric_log to remove flakiness
- Use an interval of 400ms instead of 1234ms for the 2500ms sleep to avoid having a last collection that may clash with the finish one. - Move the check for number of events to a separate check. This way we don't have to remove the first and last event, which makes the check less good the fewer events we have. - Add explicit comments of what each check does for readability.
This commit is contained in:
parent
d14ca872d4
commit
e126092c1f
@ -1,12 +1,30 @@
|
||||
number_of_metrics_1000_ok timestamp_diff_in_metrics_1000_ok
|
||||
initial_data_1000_ok
|
||||
data_1000_ok
|
||||
number_of_metrics_1234_ok timestamp_diff_in_metrics_1234_ok
|
||||
initial_data_1234_ok
|
||||
data_1234_ok
|
||||
number_of_metrics_123_ok timestamp_diff_in_metrics_123_ok
|
||||
initial_data_123_ok
|
||||
data_123_ok
|
||||
--Interval 1000: check that amount of events is correct
|
||||
1
|
||||
--Interval 1000: check that the delta/diff between the events is correct
|
||||
1
|
||||
--Interval 1000: check that the Query, SelectQuery and InitialQuery values are correct for the first event
|
||||
1
|
||||
--Interval 1000: check that the SleepFunctionCalls, SleepFunctionMilliseconds and ProfileEvent_SleepFunctionElapsedMicroseconds are correct
|
||||
1
|
||||
--Interval 400: check that amount of events is correct
|
||||
1
|
||||
--Interval 400: check that the delta/diff between the events is correct
|
||||
1
|
||||
--Interval 400: check that the Query, SelectQuery and InitialQuery values are correct for the first event
|
||||
1
|
||||
--Interval 400: check that the SleepFunctionCalls, SleepFunctionMilliseconds and ProfileEvent_SleepFunctionElapsedMicroseconds are correct
|
||||
1
|
||||
--Interval 123: check that amount of events is correct
|
||||
1
|
||||
--Interval 123: check that the delta/diff between the events is correct
|
||||
1
|
||||
--Interval 123: check that the Query, SelectQuery and InitialQuery values are correct for the first event
|
||||
1
|
||||
--Interval 123: check that the SleepFunctionCalls, SleepFunctionMilliseconds and ProfileEvent_SleepFunctionElapsedMicroseconds are correct
|
||||
1
|
||||
--Check that a query_metric_log_interval=0 disables the collection
|
||||
0
|
||||
-Check that a query which execution time is less than query_metric_log_interval is never collected
|
||||
0
|
||||
--Check that there is a final event when queries finish
|
||||
3
|
||||
|
@ -7,7 +7,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
readonly query_prefix=$CLICKHOUSE_DATABASE
|
||||
|
||||
$CLICKHOUSE_CLIENT --query-id="${query_prefix}_1000" -q "SELECT sleep(2.5) FORMAT Null" &
|
||||
$CLICKHOUSE_CLIENT --query-id="${query_prefix}_1234" -q "SELECT sleep(2.5) SETTINGS query_metric_log_interval=1234 FORMAT Null" &
|
||||
$CLICKHOUSE_CLIENT --query-id="${query_prefix}_400" -q "SELECT sleep(2.5) SETTINGS query_metric_log_interval=400 FORMAT Null" &
|
||||
$CLICKHOUSE_CLIENT --query-id="${query_prefix}_123" -q "SELECT sleep(2.5) SETTINGS query_metric_log_interval=123 FORMAT Null" &
|
||||
$CLICKHOUSE_CLIENT --query-id="${query_prefix}_0" -q "SELECT sleep(2.5) SETTINGS query_metric_log_interval=0 FORMAT Null" &
|
||||
$CLICKHOUSE_CLIENT --query-id="${query_prefix}_fast" -q "SELECT sleep(0.1) FORMAT Null" &
|
||||
@ -20,32 +20,42 @@ function check_log()
|
||||
{
|
||||
interval=$1
|
||||
|
||||
# Check that the amount of events collected is correct, leaving a 20% of margin.
|
||||
$CLICKHOUSE_CLIENT -m -q """
|
||||
SELECT '--Interval $interval: check that amount of events is correct';
|
||||
SELECT
|
||||
count() BETWEEN (ceil(2500 / $interval) * 0.8) AND (ceil(2500 / $interval) * 1.2)
|
||||
FROM system.query_metric_log
|
||||
WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}'
|
||||
"""
|
||||
|
||||
# We calculate the diff of each row with its previous row to check whether the intervals at
|
||||
# which data is collected is right. The first row is always skipped because the diff with the
|
||||
# preceding one (itself) is 0. The last row is also skipped, because it doesn't contain a full
|
||||
# interval.
|
||||
$CLICKHOUSE_CLIENT --max_threads=1 -m -q """
|
||||
WITH diff AS (
|
||||
SELECT
|
||||
row_number() OVER () AS row,
|
||||
count() OVER () as total_rows,
|
||||
event_time_microseconds,
|
||||
first_value(event_time_microseconds) OVER (ORDER BY event_time_microseconds ROWS BETWEEN 1 PRECEDING AND 0 FOLLOWING) as prev,
|
||||
dateDiff('ms', prev, event_time_microseconds) AS diff
|
||||
FROM system.query_metric_log
|
||||
WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}'
|
||||
ORDER BY event_time_microseconds
|
||||
OFFSET 1
|
||||
)
|
||||
SELECT if(count() BETWEEN ((ceil(2500 / $interval) - 2) * 0.8) AND ((ceil(2500 / $interval) - 2) * 1.2), 'number_of_metrics_${interval}_ok', 'number_of_metrics_${interval}_error'),
|
||||
if(avg(diff) BETWEEN $interval * 0.8 AND $interval * 1.2, 'timestamp_diff_in_metrics_${interval}_ok', 'timestamp_diff_in_metrics_${interval}_error')
|
||||
FROM diff WHERE row < total_rows
|
||||
SELECT '--Interval $interval: check that the delta/diff between the events is correct';
|
||||
WITH diff AS (
|
||||
SELECT
|
||||
row_number() OVER () AS row,
|
||||
count() OVER () as total_rows,
|
||||
event_time_microseconds,
|
||||
first_value(event_time_microseconds) OVER (ORDER BY event_time_microseconds ROWS BETWEEN 1 PRECEDING AND 0 FOLLOWING) as prev,
|
||||
dateDiff('ms', prev, event_time_microseconds) AS diff
|
||||
FROM system.query_metric_log
|
||||
WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}'
|
||||
ORDER BY event_time_microseconds
|
||||
OFFSET 1
|
||||
)
|
||||
SELECT avg(diff) BETWEEN $interval * 0.8 AND $interval * 1.2
|
||||
FROM diff WHERE row < total_rows
|
||||
"""
|
||||
|
||||
# Check that the first event contains information from the beginning of the query.
|
||||
# Notice the rest of the events won't contain these because the diff will be 0.
|
||||
$CLICKHOUSE_CLIENT -m -q """
|
||||
SELECT if(ProfileEvent_Query = 1 AND ProfileEvent_SelectQuery = 1 AND ProfileEvent_InitialQuery = 1, 'initial_data_${interval}_ok', 'initial_data_${interval}_error')
|
||||
SELECT '--Interval $interval: check that the Query, SelectQuery and InitialQuery values are correct for the first event';
|
||||
SELECT ProfileEvent_Query = 1 AND ProfileEvent_SelectQuery = 1 AND ProfileEvent_InitialQuery = 1
|
||||
FROM system.query_metric_log
|
||||
WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}'
|
||||
ORDER BY event_time_microseconds
|
||||
@ -55,27 +65,36 @@ function check_log()
|
||||
# Also check that it contains some data that we know it's going to be there.
|
||||
# Notice the Sleep events can be in any of the rows, not only in the first one.
|
||||
$CLICKHOUSE_CLIENT -m -q """
|
||||
SELECT if(sum(ProfileEvent_SleepFunctionCalls) = 1 AND
|
||||
sum(ProfileEvent_SleepFunctionMicroseconds) = 2500000 AND
|
||||
sum(ProfileEvent_SleepFunctionElapsedMicroseconds) = 2500000 AND
|
||||
sum(ProfileEvent_Query) = 1 AND
|
||||
sum(ProfileEvent_SelectQuery) = 1 AND
|
||||
sum(ProfileEvent_InitialQuery) = 1,
|
||||
'data_${interval}_ok', 'data_${interval}_error')
|
||||
SELECT '--Interval $interval: check that the SleepFunctionCalls, SleepFunctionMilliseconds and ProfileEvent_SleepFunctionElapsedMicroseconds are correct';
|
||||
SELECT sum(ProfileEvent_SleepFunctionCalls) = 1 AND
|
||||
sum(ProfileEvent_SleepFunctionMicroseconds) = 2500000 AND
|
||||
sum(ProfileEvent_SleepFunctionElapsedMicroseconds) = 2500000 AND
|
||||
sum(ProfileEvent_Query) = 1 AND
|
||||
sum(ProfileEvent_SelectQuery) = 1 AND
|
||||
sum(ProfileEvent_InitialQuery) = 1
|
||||
FROM system.query_metric_log
|
||||
WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}'
|
||||
"""
|
||||
}
|
||||
|
||||
check_log 1000
|
||||
check_log 1234
|
||||
check_log 400
|
||||
check_log 123
|
||||
|
||||
# query_metric_log_interval=0 disables the collection altogether
|
||||
$CLICKHOUSE_CLIENT -m -q """SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_0'"""
|
||||
$CLICKHOUSE_CLIENT -m -q """
|
||||
SELECT '--Check that a query_metric_log_interval=0 disables the collection';
|
||||
SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_0'
|
||||
"""
|
||||
|
||||
# a quick query that takes less than query_metric_log_interval is never collected
|
||||
$CLICKHOUSE_CLIENT -m -q """SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_fast'"""
|
||||
$CLICKHOUSE_CLIENT -m -q """
|
||||
SELECT '-Check that a query which execution time is less than query_metric_log_interval is never collected';
|
||||
SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_fast'
|
||||
"""
|
||||
|
||||
# a query that takes more than query_metric_log_interval is collected including the final row
|
||||
$CLICKHOUSE_CLIENT -m -q """SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_1000'"""
|
||||
$CLICKHOUSE_CLIENT -m -q """
|
||||
SELECT '--Check that there is a final event when queries finish';
|
||||
SELECT count() FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_1000'
|
||||
"""
|
||||
|
Loading…
Reference in New Issue
Block a user