2020-08-28 23:25:30 +00:00
#!/usr/bin/env bash
2021-09-12 12:35:27 +00:00
# Tags: distributed
2020-08-28 23:25:30 +00:00
set -ue
2021-02-10 18:09:13 +00:00
unset CLICKHOUSE_LOG_COMMENT
2021-01-30 23:22:55 +00:00
2020-08-28 23:25:30 +00:00
CURDIR = $( cd " $( dirname " ${ BASH_SOURCE [0] } " ) " && pwd )
2020-12-28 11:46:53 +00:00
# shellcheck source=../shell_config.sh
2020-08-28 23:25:30 +00:00
. " $CURDIR " /../shell_config.sh
function check_log
{
2020-10-20 11:35:13 +00:00
${ CLICKHOUSE_CLIENT } --format= JSONEachRow -nq "
2023-03-04 17:46:40 +00:00
set allow_experimental_analyzer = 1;
2020-08-28 23:25:30 +00:00
system flush logs;
2021-08-01 14:12:34 +00:00
-- Show queries sorted by start time.
select attribute[ 'db.statement' ] as query,
attribute[ 'clickhouse.query_status' ] as status,
attribute[ 'clickhouse.tracestate' ] as tracestate,
1 as sorted_by_start_time
from system.opentelemetry_span_log
2022-01-20 13:23:15 +00:00
where trace_id = UUIDNumToString( toFixedString( unhex( '$trace_id' ) , 16) )
2021-08-01 14:12:34 +00:00
and operation_name = 'query'
order by start_time_us
;
-- Show queries sorted by finish time.
select attribute[ 'db.statement' ] as query,
attribute[ 'clickhouse.query_status' ] as query_status,
attribute[ 'clickhouse.tracestate' ] as tracestate,
1 as sorted_by_finish_time
from system.opentelemetry_span_log
2022-01-20 13:23:15 +00:00
where trace_id = UUIDNumToString( toFixedString( unhex( '$trace_id' ) , 16) )
2021-08-01 14:12:34 +00:00
and operation_name = 'query'
order by finish_time_us
;
2020-10-20 11:35:13 +00:00
-- Check the number of query spans with given trace id, to verify it was
-- propagated.
select count( *) "'" '"total spans"' "'" ,
uniqExact( span_id) "'" '"unique spans"' "'" ,
uniqExactIf( parent_span_id, parent_span_id != 0)
"'" '"unique non-zero parent spans"' "'"
2020-10-22 16:47:20 +00:00
from system.opentelemetry_span_log
2022-01-20 13:23:15 +00:00
where trace_id = UUIDNumToString( toFixedString( unhex( '$trace_id' ) , 16) )
2020-08-28 23:25:30 +00:00
and operation_name = 'query'
;
2020-10-20 11:35:13 +00:00
-- Also check that the initial query span in ClickHouse has proper parent span.
2022-07-09 12:20:04 +00:00
-- the first span should be child of input trace context
-- the 2nd span should be the 'query' span
2020-10-20 11:35:13 +00:00
select count( *) "'" '"initial query spans with proper parent"' "'"
2022-07-09 12:20:04 +00:00
from system.opentelemetry_span_log
2021-01-26 19:56:11 +00:00
where
2022-01-20 13:23:15 +00:00
trace_id = UUIDNumToString( toFixedString( unhex( '$trace_id' ) , 16) )
2020-10-20 11:35:13 +00:00
and operation_name = 'query'
2023-03-04 17:46:40 +00:00
and parent_span_id in (
2022-07-09 12:20:04 +00:00
select span_id from system.opentelemetry_span_log where trace_id = UUIDNumToString( toFixedString( unhex( '$trace_id' ) , 16) ) and parent_span_id = reinterpretAsUInt64( unhex( '73' ) )
)
2020-10-20 11:35:13 +00:00
;
2020-08-28 23:25:30 +00:00
-- Check that the tracestate header was propagated. It must have exactly the
-- same non-empty value for all 'query' spans in this trace.
2020-10-20 11:35:13 +00:00
select uniqExact( value) "'" '"unique non-empty tracestate values"' "'"
2020-10-22 16:47:20 +00:00
from system.opentelemetry_span_log
2021-01-25 10:54:11 +00:00
array join mapKeys( attribute) as name, mapValues( attribute) as value
2020-08-28 23:25:30 +00:00
where
2022-01-20 13:23:15 +00:00
trace_id = UUIDNumToString( toFixedString( unhex( '$trace_id' ) , 16) )
2020-08-28 23:25:30 +00:00
and operation_name = 'query'
2020-09-08 13:19:27 +00:00
and name = 'clickhouse.tracestate'
2020-08-28 23:25:30 +00:00
and length( value) > 0
;
"
}
# Generate some random trace id so that the prevous runs of the test do not interfere.
2020-09-08 13:19:27 +00:00
echo "===http==="
2023-03-04 17:46:40 +00:00
trace_id = $( ${ CLICKHOUSE_CLIENT } -q "select lower(hex(reverse(reinterpretAsString(generateUUIDv4())))) settings allow_experimental_analyzer = 1" )
2020-08-28 23:25:30 +00:00
2020-10-20 11:35:13 +00:00
# Check that the HTTP traceparent is read, and then passed through `remote`
# table function. We expect 4 queries -- one initial, one SELECT and two
# DESC TABLE. Two DESC TABLE instead of one looks like a bug, see the issue:
# https://github.com/ClickHouse/ClickHouse/issues/14228
2020-09-08 13:19:27 +00:00
${ CLICKHOUSE_CURL } \
2020-10-20 11:35:13 +00:00
--header " traceparent: 00- $trace_id -0000000000000073-01 " \
2021-01-26 19:56:11 +00:00
--header "tracestate: some custom state" " $CLICKHOUSE_URL " \
2020-09-08 13:19:27 +00:00
--get \
2023-03-04 17:46:40 +00:00
--data-urlencode "query=select 1 from remote('127.0.0.2', system, one) settings allow_experimental_analyzer = 1 format Null"
2020-08-28 23:25:30 +00:00
check_log
# With another trace id, check that clickhouse-client accepts traceparent, and
# that it is passed through URL table function. We expect two query spans, one
# for the initial query, and one for the HTTP query.
2020-09-08 13:19:27 +00:00
echo "===native==="
2020-08-28 23:25:30 +00:00
trace_id = $( ${ CLICKHOUSE_CLIENT } -q "select lower(hex(reverse(reinterpretAsString(generateUUIDv4()))))" )
2020-09-08 13:19:27 +00:00
${ CLICKHOUSE_CLIENT } \
2020-10-20 11:35:13 +00:00
--opentelemetry-traceparent " 00- $trace_id -0000000000000073-01 " \
2020-09-08 13:19:27 +00:00
--opentelemetry-tracestate "another custom state" \
2020-10-20 11:35:13 +00:00
--query "select * from url('http://127.0.0.2:8123/?query=select%201%20format%20Null', CSV, 'a int')"
2020-08-28 23:25:30 +00:00
check_log
2020-09-08 13:19:27 +00:00
2020-10-20 11:35:13 +00:00
# Test sampled tracing. The traces should be started with the specified
# probability, only for initial queries.
2020-09-08 13:19:27 +00:00
echo "===sampled==="
query_id = $( ${ CLICKHOUSE_CLIENT } -q "select lower(hex(reverse(reinterpretAsString(generateUUIDv4()))))" )
2021-08-01 14:12:34 +00:00
for i in { 1..20}
2020-09-08 13:19:27 +00:00
do
${ CLICKHOUSE_CLIENT } \
2021-08-01 14:12:34 +00:00
--opentelemetry_start_trace_probability= 0.5 \
2020-10-15 17:27:18 +00:00
--query_id " $query_id - $i " \
--query "select 1 from remote('127.0.0.2', system, one) format Null" \
&
2020-10-20 11:35:13 +00:00
# clickhouse-client is slow to start (initialization of DateLUT), so run
# several clients in parallel, but not too many.
2020-10-15 17:27:18 +00:00
if [ [ $(( i % 10 )) -eq 0 ] ]
then
wait
fi
2020-09-08 13:19:27 +00:00
done
2020-10-15 17:27:18 +00:00
wait
2020-09-08 13:19:27 +00:00
2020-10-15 17:27:18 +00:00
${ CLICKHOUSE_CLIENT } -q "system flush logs"
2020-09-08 13:19:27 +00:00
${ CLICKHOUSE_CLIENT } -q "
2021-08-01 14:12:34 +00:00
-- expect 20 * 0.5 = 10 sampled events on average
select if ( 2 <= count( ) and count( ) <= 18, 'OK' , 'Fail' )
2020-10-22 16:47:20 +00:00
from system.opentelemetry_span_log
2021-01-25 10:54:11 +00:00
where operation_name = 'query'
and attribute[ 'clickhouse.query_id' ] like '$query_id-%'
2020-09-08 13:19:27 +00:00
;
"