ClickHouse/tests/queries/0_stateless/01455_opentelemetry_distributed.sh
2021-08-18 14:24:52 +03:00

141 lines
5.1 KiB
Bash
Executable File

#!/usr/bin/env bash
set -ue
unset CLICKHOUSE_LOG_COMMENT
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
function check_log
{
${CLICKHOUSE_CLIENT} --format=JSONEachRow -nq "
system flush logs;
-- Show queries sorted by start time.
select attribute['db.statement'] as query,
attribute['clickhouse.query_status'] as status,
attribute['clickhouse.tracestate'] as tracestate,
1 as sorted_by_start_time
from system.opentelemetry_span_log
where trace_id = reinterpretAsUUID(reverse(unhex('$trace_id')))
and operation_name = 'query'
order by start_time_us
;
-- Show queries sorted by finish time.
select attribute['db.statement'] as query,
attribute['clickhouse.query_status'] as query_status,
attribute['clickhouse.tracestate'] as tracestate,
1 as sorted_by_finish_time
from system.opentelemetry_span_log
where trace_id = reinterpretAsUUID(reverse(unhex('$trace_id')))
and operation_name = 'query'
order by finish_time_us
;
-- Check the number of query spans with given trace id, to verify it was
-- propagated.
select count(*) "'"'"total spans"'"'",
uniqExact(span_id) "'"'"unique spans"'"'",
uniqExactIf(parent_span_id, parent_span_id != 0)
"'"'"unique non-zero parent spans"'"'"
from system.opentelemetry_span_log
where trace_id = reinterpretAsUUID(reverse(unhex('$trace_id')))
and operation_name = 'query'
;
-- Also check that the initial query span in ClickHouse has proper parent span.
select count(*) "'"'"initial query spans with proper parent"'"'"
from
(select *, attribute_name, attribute_value
from system.opentelemetry_span_log
array join mapKeys(attribute) as attribute_name,
mapValues(attribute) as attribute_value) o
join system.query_log on query_id = o.attribute_value
where
trace_id = reinterpretAsUUID(reverse(unhex('$trace_id')))
and current_database = currentDatabase()
and operation_name = 'query'
and parent_span_id = reinterpretAsUInt64(unhex('73'))
and o.attribute_name = 'clickhouse.query_id'
and is_initial_query
and type = 'QueryFinish'
;
-- Check that the tracestate header was propagated. It must have exactly the
-- same non-empty value for all 'query' spans in this trace.
select uniqExact(value) "'"'"unique non-empty tracestate values"'"'"
from system.opentelemetry_span_log
array join mapKeys(attribute) as name, mapValues(attribute) as value
where
trace_id = reinterpretAsUUID(reverse(unhex('$trace_id')))
and operation_name = 'query'
and name = 'clickhouse.tracestate'
and length(value) > 0
;
"
}
# Generate some random trace id so that the prevous runs of the test do not interfere.
echo "===http==="
trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(reverse(reinterpretAsString(generateUUIDv4()))))")
# Check that the HTTP traceparent is read, and then passed through `remote`
# table function. We expect 4 queries -- one initial, one SELECT and two
# DESC TABLE. Two DESC TABLE instead of one looks like a bug, see the issue:
# https://github.com/ClickHouse/ClickHouse/issues/14228
${CLICKHOUSE_CURL} \
--header "traceparent: 00-$trace_id-0000000000000073-01" \
--header "tracestate: some custom state" "$CLICKHOUSE_URL" \
--get \
--data-urlencode "query=select 1 from remote('127.0.0.2', system, one) format Null"
check_log
# With another trace id, check that clickhouse-client accepts traceparent, and
# that it is passed through URL table function. We expect two query spans, one
# for the initial query, and one for the HTTP query.
echo "===native==="
trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(reverse(reinterpretAsString(generateUUIDv4()))))")
${CLICKHOUSE_CLIENT} \
--opentelemetry-traceparent "00-$trace_id-0000000000000073-01" \
--opentelemetry-tracestate "another custom state" \
--query "select * from url('http://127.0.0.2:8123/?query=select%201%20format%20Null', CSV, 'a int')"
check_log
# Test sampled tracing. The traces should be started with the specified
# probability, only for initial queries.
echo "===sampled==="
query_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(reverse(reinterpretAsString(generateUUIDv4()))))")
for i in {1..20}
do
${CLICKHOUSE_CLIENT} \
--opentelemetry_start_trace_probability=0.5 \
--query_id "$query_id-$i" \
--query "select 1 from remote('127.0.0.2', system, one) format Null" \
&
# clickhouse-client is slow to start (initialization of DateLUT), so run
# several clients in parallel, but not too many.
if [[ $((i % 10)) -eq 0 ]]
then
wait
fi
done
wait
${CLICKHOUSE_CLIENT} -q "system flush logs"
${CLICKHOUSE_CLIENT} -q "
-- expect 20 * 0.5 = 10 sampled events on average
select if(2 <= count() and count() <= 18, 'OK', 'Fail')
from system.opentelemetry_span_log
where operation_name = 'query'
and parent_span_id = 0 -- only account for the initial queries
and attribute['clickhouse.query_id'] like '$query_id-%'
;
"