mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 09:32:06 +00:00
Merge pull request #15230 from ClickHouse/aku/debug-perf
debug perf test run time
This commit is contained in:
commit
2523101302
@ -181,6 +181,9 @@ function run_tests
|
||||
# Randomize test order.
|
||||
test_files=$(for f in $test_files; do echo "$f"; done | sort -R)
|
||||
|
||||
# Limit profiling time to 10 minutes, not to run for too long.
|
||||
profile_seconds_left=600
|
||||
|
||||
# Run the tests.
|
||||
test_name="<none>"
|
||||
for test in $test_files
|
||||
@ -194,15 +197,24 @@ function run_tests
|
||||
test_name=$(basename "$test" ".xml")
|
||||
echo test "$test_name"
|
||||
|
||||
# Don't profile if we're past the time limit.
|
||||
# Use awk because bash doesn't support floating point arithmetics.
|
||||
profile_seconds=$(awk "BEGIN { print ($profile_seconds_left > 0 ? 10 : 0) }")
|
||||
|
||||
TIMEFORMAT=$(printf "$test_name\t%%3R\t%%3U\t%%3S\n")
|
||||
# The grep is to filter out set -x output and keep only time output.
|
||||
# The '2>&1 >/dev/null' redirects stderr to stdout, and discards stdout.
|
||||
{ \
|
||||
time "$script_dir/perf.py" --host localhost localhost --port 9001 9002 \
|
||||
--runs "$CHPC_RUNS" --max-queries "$CHPC_MAX_QUERIES" \
|
||||
--profile-seconds "$profile_seconds" \
|
||||
-- "$test" > "$test_name-raw.tsv" 2> "$test_name-err.log" ; \
|
||||
} 2>&1 >/dev/null | tee >(grep -v ^+ >> "wall-clock-times.tsv") \
|
||||
|| echo "Test $test_name failed with error code $?" >> "$test_name-err.log"
|
||||
|
||||
profile_seconds_left=$(awk -F' ' \
|
||||
'BEGIN { s = '$profile_seconds_left'; } /^profile-total/ { s -= $2 } END { print s }' \
|
||||
"$test_name-raw.tsv")
|
||||
done
|
||||
|
||||
unset TIMEFORMAT
|
||||
@ -294,6 +306,7 @@ for test_file in *-raw.tsv
|
||||
do
|
||||
test_name=$(basename "$test_file" "-raw.tsv")
|
||||
sed -n "s/^query\t/$test_name\t/p" < "$test_file" >> "analyze/query-runs.tsv"
|
||||
sed -n "s/^profile\t/$test_name\t/p" < "$test_file" >> "analyze/query-profiles.tsv"
|
||||
sed -n "s/^client-time\t/$test_name\t/p" < "$test_file" >> "analyze/client-times.tsv"
|
||||
sed -n "s/^report-threshold\t/$test_name\t/p" < "$test_file" >> "analyze/report-thresholds.tsv"
|
||||
sed -n "s/^skipped\t/$test_name\t/p" < "$test_file" >> "analyze/skipped-tests.tsv"
|
||||
@ -658,13 +671,15 @@ create view test_runs as
|
||||
group by test
|
||||
;
|
||||
|
||||
create table test_times_report engine File(TSV, 'report/test-times.tsv') as
|
||||
select wall_clock_time_per_test.test, real,
|
||||
toDecimal64(total_client_time, 3),
|
||||
create view test_times_view as
|
||||
select
|
||||
wall_clock_time_per_test.test test,
|
||||
real,
|
||||
total_client_time,
|
||||
queries,
|
||||
toDecimal64(query_max, 3),
|
||||
toDecimal64(real / queries, 3) avg_real_per_query,
|
||||
toDecimal64(query_min, 3),
|
||||
query_max,
|
||||
real / queries avg_real_per_query,
|
||||
query_min,
|
||||
runs
|
||||
from test_time
|
||||
-- wall clock times are also measured for skipped tests, so don't
|
||||
@ -673,7 +688,43 @@ create table test_times_report engine File(TSV, 'report/test-times.tsv') as
|
||||
on wall_clock_time_per_test.test = test_time.test
|
||||
full join test_runs
|
||||
on test_runs.test = test_time.test
|
||||
order by avg_real_per_query desc;
|
||||
;
|
||||
|
||||
-- WITH TOTALS doesn't work with INSERT SELECT, so we have to jump through these
|
||||
-- hoops: https://github.com/ClickHouse/ClickHouse/issues/15227
|
||||
create view test_times_view_total as
|
||||
select
|
||||
'Total' test,
|
||||
sum(real),
|
||||
sum(total_client_time),
|
||||
sum(queries),
|
||||
max(query_max),
|
||||
sum(real) / sum(queries) avg_real_per_query,
|
||||
min(query_min),
|
||||
-- Totaling the number of runs doesn't make sense, but use the max so
|
||||
-- that the reporting script doesn't complain about queries being too
|
||||
-- long.
|
||||
max(runs)
|
||||
from test_times_view
|
||||
;
|
||||
|
||||
create table test_times_report engine File(TSV, 'report/test-times.tsv') as
|
||||
select
|
||||
test,
|
||||
toDecimal64(real, 3),
|
||||
toDecimal64(total_client_time, 3),
|
||||
queries,
|
||||
toDecimal64(query_max, 3),
|
||||
toDecimal64(avg_real_per_query, 3),
|
||||
toDecimal64(query_min, 3),
|
||||
runs
|
||||
from (
|
||||
select * from test_times_view
|
||||
union all
|
||||
select * from test_times_view_total
|
||||
)
|
||||
order by test = 'Total' desc, avg_real_per_query desc
|
||||
;
|
||||
|
||||
-- report for all queries page, only main metric
|
||||
create table all_tests_report engine File(TSV, 'report/all-queries.tsv') as
|
||||
@ -694,13 +745,12 @@ create table all_tests_report engine File(TSV, 'report/all-queries.tsv') as
|
||||
test, query_index, query_display_name
|
||||
from queries order by test, query_index;
|
||||
|
||||
-- queries for which we will build flamegraphs (see below)
|
||||
create table queries_for_flamegraph engine File(TSVWithNamesAndTypes,
|
||||
'report/queries-for-flamegraph.tsv') as
|
||||
select test, query_index from queries where unstable_show or changed_show
|
||||
;
|
||||
|
||||
|
||||
-- Report of queries that have inconsistent 'short' markings:
|
||||
-- 1) have short duration, but are not marked as 'short'
|
||||
-- 2) the reverse -- marked 'short' but take too long.
|
||||
-- The threshold for 2) is significantly larger than the threshold for 1), to
|
||||
-- avoid jitter.
|
||||
create view shortness
|
||||
as select
|
||||
(test, query_index) in
|
||||
@ -718,11 +768,6 @@ create view shortness
|
||||
and times.query_index = query_display_names.query_index
|
||||
;
|
||||
|
||||
-- Report of queries that have inconsistent 'short' markings:
|
||||
-- 1) have short duration, but are not marked as 'short'
|
||||
-- 2) the reverse -- marked 'short' but take too long.
|
||||
-- The threshold for 2) is significantly larger than the threshold for 1), to
|
||||
-- avoid jitter.
|
||||
create table inconsistent_short_marking_report
|
||||
engine File(TSV, 'report/unexpected-query-duration.tsv')
|
||||
as select
|
||||
@ -759,18 +804,15 @@ create table all_query_metrics_tsv engine File(TSV, 'report/all-query-metrics.ts
|
||||
" 2> >(tee -a report/errors.log 1>&2)
|
||||
|
||||
|
||||
# Prepare source data for metrics and flamegraphs for unstable queries.
|
||||
# Prepare source data for metrics and flamegraphs for queries that were profiled
|
||||
# by perf.py.
|
||||
for version in {right,left}
|
||||
do
|
||||
rm -rf data
|
||||
clickhouse-local --query "
|
||||
create view queries_for_flamegraph as
|
||||
select * from file('report/queries-for-flamegraph.tsv', TSVWithNamesAndTypes,
|
||||
'test text, query_index int');
|
||||
|
||||
create view query_runs as
|
||||
create view query_profiles as
|
||||
with 0 as left, 1 as right
|
||||
select * from file('analyze/query-runs.tsv', TSV,
|
||||
select * from file('analyze/query-profiles.tsv', TSV,
|
||||
'test text, query_index int, query_id text, version UInt8, time float')
|
||||
where version = $version
|
||||
;
|
||||
@ -782,15 +824,12 @@ create view query_display_names as select * from
|
||||
|
||||
create table unstable_query_runs engine File(TSVWithNamesAndTypes,
|
||||
'unstable-query-runs.$version.rep') as
|
||||
select query_runs.test test, query_runs.query_index query_index,
|
||||
select query_profiles.test test, query_profiles.query_index query_index,
|
||||
query_display_name, query_id
|
||||
from query_runs
|
||||
join queries_for_flamegraph on
|
||||
query_runs.test = queries_for_flamegraph.test
|
||||
and query_runs.query_index = queries_for_flamegraph.query_index
|
||||
from query_profiles
|
||||
left join query_display_names on
|
||||
query_runs.test = query_display_names.test
|
||||
and query_runs.query_index = query_display_names.query_index
|
||||
query_profiles.test = query_display_names.test
|
||||
and query_profiles.query_index = query_display_names.query_index
|
||||
;
|
||||
|
||||
create view query_log as select *
|
||||
|
@ -18,9 +18,22 @@ import xml.etree.ElementTree as et
|
||||
from threading import Thread
|
||||
from scipy import stats
|
||||
|
||||
|
||||
total_start_seconds = time.perf_counter()
|
||||
stage_start_seconds = total_start_seconds
|
||||
|
||||
def reportStageEnd(stage):
|
||||
global stage_start_seconds, total_start_seconds
|
||||
|
||||
current = time.perf_counter()
|
||||
print(f'stage\t{stage}\t{current - stage_start_seconds:.3f}\t{current - total_start_seconds:.3f}')
|
||||
stage_start_seconds = current
|
||||
|
||||
|
||||
def tsv_escape(s):
|
||||
return s.replace('\\', '\\\\').replace('\t', '\\t').replace('\n', '\\n').replace('\r','')
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(description='Run performance test.')
|
||||
# Explicitly decode files as UTF-8 because sometimes we have Russian characters in queries, and LANG=C is set.
|
||||
parser.add_argument('file', metavar='FILE', type=argparse.FileType('r', encoding='utf-8'), nargs=1, help='test description file')
|
||||
@ -29,16 +42,21 @@ parser.add_argument('--port', nargs='*', default=[9000], help="Space-separated l
|
||||
parser.add_argument('--runs', type=int, default=1, help='Number of query runs per server.')
|
||||
parser.add_argument('--max-queries', type=int, default=None, help='Test no more than this number of queries, chosen at random.')
|
||||
parser.add_argument('--queries-to-run', nargs='*', type=int, default=None, help='Space-separated list of indexes of queries to test.')
|
||||
parser.add_argument('--profile-seconds', type=int, default=0, help='For how many seconds to profile a query for which the performance has changed.')
|
||||
parser.add_argument('--long', action='store_true', help='Do not skip the tests tagged as long.')
|
||||
parser.add_argument('--print-queries', action='store_true', help='Print test queries and exit.')
|
||||
parser.add_argument('--print-settings', action='store_true', help='Print test settings and exit.')
|
||||
args = parser.parse_args()
|
||||
|
||||
reportStageEnd('start')
|
||||
|
||||
test_name = os.path.splitext(os.path.basename(args.file[0].name))[0]
|
||||
|
||||
tree = et.parse(args.file[0])
|
||||
root = tree.getroot()
|
||||
|
||||
reportStageEnd('parse')
|
||||
|
||||
# Process query parameters
|
||||
subst_elems = root.findall('substitutions/substitution')
|
||||
available_parameters = {} # { 'table': ['hits_10m', 'hits_100m'], ... }
|
||||
@ -112,15 +130,21 @@ if not args.long:
|
||||
sys.exit(0)
|
||||
|
||||
# Print report threshold for the test if it is set.
|
||||
ignored_relative_change = 0.05
|
||||
if 'max_ignored_relative_change' in root.attrib:
|
||||
print(f'report-threshold\t{root.attrib["max_ignored_relative_change"]}')
|
||||
ignored_relative_change = float(root.attrib["max_ignored_relative_change"])
|
||||
print(f'report-threshold\t{ignored_relative_change}')
|
||||
|
||||
reportStageEnd('before-connect')
|
||||
|
||||
# Open connections
|
||||
servers = [{'host': host, 'port': port} for (host, port) in zip(args.host, args.port)]
|
||||
servers = [{'host': host or args.host[0], 'port': port or args.port[0]} for (host, port) in itertools.zip_longest(args.host, args.port)]
|
||||
all_connections = [clickhouse_driver.Client(**server) for server in servers]
|
||||
|
||||
for s in servers:
|
||||
print('server\t{}\t{}'.format(s['host'], s['port']))
|
||||
for i, s in enumerate(servers):
|
||||
print(f'server\t{i}\t{s["host"]}\t{s["port"]}')
|
||||
|
||||
reportStageEnd('connect')
|
||||
|
||||
# Run drop queries, ignoring errors. Do this before all other activity, because
|
||||
# clickhouse_driver disconnects on error (this is not configurable), and the new
|
||||
@ -135,6 +159,8 @@ for conn_index, c in enumerate(all_connections):
|
||||
except:
|
||||
pass
|
||||
|
||||
reportStageEnd('drop-1')
|
||||
|
||||
# Apply settings.
|
||||
# If there are errors, report them and continue -- maybe a new test uses a setting
|
||||
# that is not in master, but the queries can still run. If we have multiple
|
||||
@ -152,6 +178,8 @@ for conn_index, c in enumerate(all_connections):
|
||||
except:
|
||||
print(traceback.format_exc(), file=sys.stderr)
|
||||
|
||||
reportStageEnd('settings')
|
||||
|
||||
# Check tables that should exist. If they don't exist, just skip this test.
|
||||
tables = [e.text for e in root.findall('preconditions/table_exists')]
|
||||
for t in tables:
|
||||
@ -164,6 +192,8 @@ for t in tables:
|
||||
print(f'skipped\t{tsv_escape(skipped_message)}')
|
||||
sys.exit(0)
|
||||
|
||||
reportStageEnd('preconditions')
|
||||
|
||||
# Run create and fill queries. We will run them simultaneously for both servers,
|
||||
# to save time.
|
||||
# The weird search is to keep the relative order of elements, which matters, and
|
||||
@ -194,6 +224,9 @@ for t in threads:
|
||||
for t in threads:
|
||||
t.join()
|
||||
|
||||
reportStageEnd('create')
|
||||
|
||||
# By default, test all queries.
|
||||
queries_to_run = range(0, len(test_queries))
|
||||
|
||||
if args.max_queries:
|
||||
@ -205,6 +238,7 @@ if args.queries_to_run:
|
||||
queries_to_run = args.queries_to_run
|
||||
|
||||
# Run test queries.
|
||||
profile_total_seconds = 0
|
||||
for query_index in queries_to_run:
|
||||
q = test_queries[query_index]
|
||||
query_prefix = f'{test_name}.query{query_index}'
|
||||
@ -324,34 +358,49 @@ for query_index in queries_to_run:
|
||||
client_seconds = time.perf_counter() - start_seconds
|
||||
print(f'client-time\t{query_index}\t{client_seconds}\t{server_seconds}')
|
||||
|
||||
#print(all_server_times)
|
||||
#print(stats.ttest_ind(all_server_times[0], all_server_times[1], equal_var = False).pvalue)
|
||||
|
||||
# Run additional profiling queries to collect profile data, but only if test times appeared to be different.
|
||||
# We have to do it after normal runs because otherwise it will affect test statistics too much
|
||||
if len(all_server_times) == 2 and stats.ttest_ind(all_server_times[0], all_server_times[1], equal_var = False).pvalue < 0.1:
|
||||
run = 0
|
||||
while True:
|
||||
run_id = f'{query_prefix}.profile{run}'
|
||||
if len(all_server_times) != 2:
|
||||
continue
|
||||
|
||||
for conn_index, c in enumerate(this_query_connections):
|
||||
try:
|
||||
res = c.execute(q, query_id = run_id, settings = {'query_profiler_real_time_period_ns': 10000000})
|
||||
print(f'profile\t{query_index}\t{run_id}\t{conn_index}\t{c.last_query.elapsed}')
|
||||
except Exception as e:
|
||||
# Add query id to the exception to make debugging easier.
|
||||
e.args = (run_id, *e.args)
|
||||
e.message = run_id + ': ' + e.message
|
||||
raise
|
||||
if len(all_server_times[0]) < 3:
|
||||
# Don't fail if for some reason there are not enough measurements.
|
||||
continue
|
||||
|
||||
elapsed = c.last_query.elapsed
|
||||
profile_seconds += elapsed
|
||||
pvalue = stats.ttest_ind(all_server_times[0], all_server_times[1], equal_var = False).pvalue
|
||||
median = [statistics.median(t) for t in all_server_times]
|
||||
# Keep this consistent with the value used in report. Should eventually move
|
||||
# to (median[1] - median[0]) / min(median), which is compatible with "times"
|
||||
# difference we use in report (max(median) / min(median)).
|
||||
relative_diff = (median[1] - median[0]) / median[0]
|
||||
print(f'diff\t{query_index}\t{median[0]}\t{median[1]}\t{relative_diff}\t{pvalue}')
|
||||
if abs(relative_diff) < ignored_relative_change or pvalue > 0.05:
|
||||
continue
|
||||
|
||||
run += 1
|
||||
# Don't spend too much time for profile runs
|
||||
if run > args.runs or profile_seconds > 10:
|
||||
break
|
||||
# And don't bother with short queries
|
||||
# Perform profile runs for fixed amount of time. Don't limit the number
|
||||
# of runs, because we also have short queries.
|
||||
profile_start_seconds = time.perf_counter()
|
||||
run = 0
|
||||
while time.perf_counter() - profile_start_seconds < args.profile_seconds:
|
||||
run_id = f'{query_prefix}.profile{run}'
|
||||
|
||||
for conn_index, c in enumerate(this_query_connections):
|
||||
try:
|
||||
res = c.execute(q, query_id = run_id, settings = {'query_profiler_real_time_period_ns': 10000000})
|
||||
print(f'profile\t{query_index}\t{run_id}\t{conn_index}\t{c.last_query.elapsed}')
|
||||
except Exception as e:
|
||||
# Add query id to the exception to make debugging easier.
|
||||
e.args = (run_id, *e.args)
|
||||
e.message = run_id + ': ' + e.message
|
||||
raise
|
||||
|
||||
run += 1
|
||||
|
||||
profile_total_seconds += time.perf_counter() - profile_start_seconds
|
||||
|
||||
print(f'profile-total\t{profile_total_seconds}')
|
||||
|
||||
reportStageEnd('run')
|
||||
|
||||
# Run drop queries
|
||||
drop_queries = substitute_parameters(drop_query_templates)
|
||||
@ -359,3 +408,5 @@ for conn_index, c in enumerate(all_connections):
|
||||
for q in drop_queries:
|
||||
c.execute(q)
|
||||
print(f'drop\t{conn_index}\t{c.last_query.elapsed}\t{tsv_escape(q)}')
|
||||
|
||||
reportStageEnd('drop-2')
|
||||
|
@ -487,7 +487,7 @@ if args.report == 'main':
|
||||
for r in rows:
|
||||
anchor = f'{currentTableAnchor()}.{r[0]}'
|
||||
total_runs = (int(r[7]) + 1) * 2 # one prewarm run, two servers
|
||||
if float(r[5]) > allowed_average_run_time * total_runs:
|
||||
if r[0] != 'Total' and float(r[5]) > allowed_average_run_time * total_runs:
|
||||
# FIXME should be 15s max -- investigate parallel_insert
|
||||
slow_average_tests += 1
|
||||
attrs[5] = f'style="background: {color_bad}"'
|
||||
@ -495,7 +495,7 @@ if args.report == 'main':
|
||||
else:
|
||||
attrs[5] = ''
|
||||
|
||||
if float(r[4]) > allowed_single_run_time * total_runs:
|
||||
if r[0] != 'Total' and float(r[4]) > allowed_single_run_time * total_runs:
|
||||
slow_average_tests += 1
|
||||
attrs[4] = f'style="background: {color_bad}"'
|
||||
errors_explained.append([f'<a href="./all-queries.html#all-query-times.{r[0]}.0">Some query of the test \'{r[0]}\' is too slow to run. See the all queries report'])
|
||||
|
Loading…
Reference in New Issue
Block a user