Merge pull request #47499 from FrankChen021/span_kind

This commit is contained in:
Vladimir C 2023-03-13 11:35:03 +01:00 committed by GitHub
commit 9cb018545c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 99 additions and 6 deletions

View File

@ -15,6 +15,13 @@ Columns:
- `operation_name` ([String](../../sql-reference/data-types/string.md)) — The name of the operation.
- `kind` ([Enum8](../../sql-reference/data-types/enum.md)) — The [SpanKind](https://opentelemetry.io/docs/reference/specification/trace/api/#spankind) of the span.
- `INTERNAL` — Indicates that the span represents an internal operation within an application.
- `SERVER` — Indicates that the span covers server-side handling of a synchronous RPC or other remote request.
- `CLIENT` — Indicates that the span describes a request to some remote service.
- `PRODUCER` — Indicates that the span describes the initiators of an asynchronous request. This parent span will often end before the corresponding child CONSUMER span, possibly even before the child span starts.
- `CONSUMER` - Indicates that the span describes a child of an asynchronous PRODUCER request.
- `start_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The start time of the `trace span` (in microseconds).
- `finish_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The finish time of the `trace span` (in microseconds).
@ -42,6 +49,7 @@ trace_id: cdab0847-0d62-61d5-4d38-dd65b19a1914
span_id: 701487461015578150
parent_span_id: 2991972114672045096
operation_name: DB::Block DB::InterpreterSelectQuery::getSampleBlockImpl()
kind: INTERNAL
start_time_us: 1612374594529090
finish_time_us: 1612374594529108
finish_date: 2021-02-03

View File

@ -506,7 +506,7 @@ void Connection::sendQuery(
bool with_pending_data,
std::function<void(const Progress &)>)
{
OpenTelemetry::SpanHolder span("Connection::sendQuery()");
OpenTelemetry::SpanHolder span("Connection::sendQuery()", OpenTelemetry::CLIENT);
span.addAttribute("clickhouse.query_id", query_id_);
span.addAttribute("clickhouse.query", query);
span.addAttribute("target", [this] () { return this->getHost() + ":" + std::to_string(this->getPort()); });

View File

@ -92,7 +92,7 @@ bool Span::addAttributeImpl(std::string_view name, std::string_view value) noexc
return true;
}
SpanHolder::SpanHolder(std::string_view _operation_name)
SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
{
if (!current_thread_trace_context.isTraceEnabled())
{
@ -106,6 +106,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name)
this->parent_span_id = current_thread_trace_context.span_id;
this->span_id = thread_local_rng(); // create a new id for this span
this->operation_name = _operation_name;
this->kind = _kind;
this->start_time_us
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

View File

@ -13,6 +13,29 @@ class ReadBuffer;
namespace OpenTelemetry
{
/// See https://opentelemetry.io/docs/reference/specification/trace/api/#spankind
enum SpanKind
{
/// Default value. Indicates that the span represents an internal operation within an application,
/// as opposed to an operations with remote parents or children.
INTERNAL = 0,
/// Indicates that the span covers server-side handling of a synchronous RPC or other remote request.
/// This span is often the child of a remote CLIENT span that was expected to wait for a response.
SERVER = 1,
/// Indicates that the span describes a request to some remote service.
/// This span is usually the parent of a remote SERVER span and does not end until the response is received.
CLIENT = 2,
/// Indicates that the span describes the initiators of an asynchronous request. This parent span will often end before the corresponding child CONSUMER span, possibly even before the child span starts.
/// In messaging scenarios with batching, tracing individual messages requires a new PRODUCER span per message to be created.
PRODUCER = 3,
/// Indicates that the span describes a child of an asynchronous PRODUCER request
CONSUMER = 4
};
struct Span
{
UUID trace_id{};
@ -21,6 +44,7 @@ struct Span
String operation_name;
UInt64 start_time_us = 0;
UInt64 finish_time_us = 0;
SpanKind kind = INTERNAL;
Map attributes;
/// Following methods are declared as noexcept to make sure they're exception safe.
@ -155,7 +179,7 @@ using TracingContextHolderPtr = std::unique_ptr<TracingContextHolder>;
/// Once it's created or destructed, it automatically maitains the tracing context on the thread that it lives.
struct SpanHolder : public Span
{
SpanHolder(std::string_view);
SpanHolder(std::string_view, SpanKind _kind = INTERNAL);
~SpanHolder();
/// Finish a span explicitly if needed.

View File

@ -542,6 +542,7 @@ void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper)
OpenTelemetry::TracingContextHolder tracing_ctx_holder(__PRETTY_FUNCTION__ ,
task.entry.tracing_context,
this->context->getOpenTelemetrySpanLog());
tracing_ctx_holder.root_span.kind = OpenTelemetry::CONSUMER;
String active_node_path = task.getActiveNodePath();
String finished_node_path = task.getFinishedNodePath();

View File

@ -8,6 +8,7 @@
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypeEnum.h>
#include <Interpreters/Context.h>
#include <base/hex.h>
@ -20,11 +21,23 @@ namespace DB
NamesAndTypesList OpenTelemetrySpanLogElement::getNamesAndTypes()
{
auto span_kind_type = std::make_shared<DataTypeEnum8>(
DataTypeEnum8::Values
{
{"INTERNAL", static_cast<Int8>(OpenTelemetry::INTERNAL)},
{"SERVER", static_cast<Int8>(OpenTelemetry::SERVER)},
{"CLIENT", static_cast<Int8>(OpenTelemetry::CLIENT)},
{"PRODUCER", static_cast<Int8>(OpenTelemetry::PRODUCER)},
{"CONSUMER", static_cast<Int8>(OpenTelemetry::CONSUMER)}
}
);
return {
{"trace_id", std::make_shared<DataTypeUUID>()},
{"span_id", std::make_shared<DataTypeUInt64>()},
{"parent_span_id", std::make_shared<DataTypeUInt64>()},
{"operation_name", std::make_shared<DataTypeString>()},
{"kind", std::move(span_kind_type)},
// DateTime64 is really unwieldy -- there is no "normal" way to convert
// it to an UInt64 count of microseconds, except:
// 1) reinterpretAsUInt64(reinterpretAsFixedString(date)), which just
@ -59,6 +72,7 @@ void OpenTelemetrySpanLogElement::appendToBlock(MutableColumns & columns) const
columns[i++]->insert(span_id);
columns[i++]->insert(parent_span_id);
columns[i++]->insert(operation_name);
columns[i++]->insert(kind);
columns[i++]->insert(start_time_us);
columns[i++]->insert(finish_time_us);
columns[i++]->insert(DateLUT::instance().toDayNum(finish_time_us / 1000000).toUnderType());

View File

@ -55,7 +55,7 @@ bool isSupportedAlterType(int type)
BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, const DDLQueryOnClusterParams & params)
{
OpenTelemetry::SpanHolder span(__FUNCTION__);
OpenTelemetry::SpanHolder span(__FUNCTION__, OpenTelemetry::PRODUCER);
if (context->getCurrentTransaction() && context->getSettingsRef().throw_on_unsupported_query_inside_transaction)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ON CLUSTER queries inside transactions are not supported");

View File

@ -843,6 +843,7 @@ namespace
query_context->getClientInfo().client_trace_context,
query_context->getSettingsRef(),
query_context->getOpenTelemetrySpanLog());
thread_trace_context->root_span.kind = OpenTelemetry::SERVER;
/// Prepare for sending exceptions and logs.
const Settings & settings = query_context->getSettingsRef();

View File

@ -1001,6 +1001,7 @@ void HTTPHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse
client_info.client_trace_context,
context->getSettingsRef(),
context->getOpenTelemetrySpanLog());
thread_trace_context->root_span.kind = OpenTelemetry::SERVER;
thread_trace_context->root_span.addAttribute("clickhouse.uri", request.getURI());
response.setContentType("text/plain; charset=UTF-8");

View File

@ -277,6 +277,7 @@ void TCPHandler::runImpl()
query_context->getClientInfo().client_trace_context,
query_context->getSettingsRef(),
query_context->getOpenTelemetrySpanLog());
thread_trace_context->root_span.kind = OpenTelemetry::SERVER;
query_scope.emplace(query_context, /* fatal_error_callback */ [this]
{

View File

@ -1,8 +1,18 @@
===1===
{"operation_name":"void DB::DistributedSink::writeToLocal(const Cluster::ShardInfo &, const DB::Block &, size_t)","cluster":"test_cluster_two_shards_localhost","shard":"1","rows":"1","bytes":"8"}
{"operation_name":"void DB::DistributedSink::writeToLocal(const Cluster::ShardInfo &, const DB::Block &, size_t)","cluster":"test_cluster_two_shards_localhost","shard":"2","rows":"1","bytes":"8"}
1
===2===
{"operation_name":"void DB::DistributedAsyncInsertDirectoryQueue::processFile(const std::string &)","cluster":"test_cluster_two_shards_localhost","shard":"1","rows":"1","bytes":"8"}
{"operation_name":"void DB::DistributedAsyncInsertDirectoryQueue::processFile(const std::string &)","cluster":"test_cluster_two_shards_localhost","shard":"2","rows":"1","bytes":"8"}
3
2
===3===
{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards_localhost","shard":"1","rows":"1","bytes":"8"}
{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards_localhost","shard":"2","rows":"1","bytes":"8"}
1
===4===
{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards_localhost","shard":"1","rows":"1","bytes":"8"}
{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards_localhost","shard":"2","rows":"1","bytes":"8"}
3
2

View File

@ -42,6 +42,22 @@ ${CLICKHOUSE_CLIENT} -nq "
;"
}
#
# $1 - OpenTelemetry Trace Id
# $2 - value of insert_distributed_sync
function check_span_kind()
{
${CLICKHOUSE_CLIENT} -nq "
SYSTEM FLUSH LOGS;
SELECT count()
FROM system.opentelemetry_span_log
WHERE finish_date >= yesterday()
AND lower(hex(trace_id)) = '${1}'
AND kind = '${2}'
;"
}
#
# Prepare tables for tests
@ -57,30 +73,46 @@ CREATE TABLE ${CLICKHOUSE_DATABASE}.local_opentelemetry (key UInt64) Engine=Merg
#
# test1
#
echo "===1==="
trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))");
insert $trace_id 0 1 "async-insert-writeToLocal"
check_span $trace_id
# 1 HTTP SERVER spans
check_span_kind $trace_id 'SERVER'
#
# test2
#
echo "===2==="
trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))");
insert $trace_id 0 0 "async-insert-writeToRemote"
check_span $trace_id
# 3 SERVER spans, 1 for HTTP, 2 for TCP
check_span_kind $trace_id 'SERVER'
# 2 CLIENT spans
check_span_kind $trace_id 'CLIENT'
#
# test3
#
trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))");
insert $trace_id 1 1 "sync-insert-writeToLocal"
echo "===3==="
check_span $trace_id
# 1 HTTP SERVER spans
check_span_kind $trace_id 'SERVER'
#
# test4
#
echo "===4==="
trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))");
insert $trace_id 1 0 "sync-insert-writeToRemote"
check_span $trace_id
# 3 SERVER spans, 1 for HTTP, 2 for TCP
check_span_kind $trace_id 'SERVER'
# 2 CLIENT spans
check_span_kind $trace_id 'CLIENT'
#
# Cleanup

View File

@ -144,8 +144,8 @@ else
# For Replicated database it will fail on initiator before enqueueing distributed DDL
expected=0
fi
check_span $expected $trace_id "%executeDDLQueryOnCluster%" "attribute['clickhouse.cluster']='${cluster_name}'"
check_span $expected $trace_id "%DDLWorker::processTask%"
check_span $expected $trace_id "%executeDDLQueryOnCluster%" "attribute['clickhouse.cluster']='${cluster_name}' AND kind = 'PRODUCER'"
check_span $expected $trace_id "%DDLWorker::processTask%" "kind = 'CONSUMER'"
if [ $cluster_name = "test_shard_localhost" ]; then
# There should be two 'query' spans, one is for the HTTPHandler, the other is for the DDL executing in DDLWorker.