This commit is contained in:
Alexander Kuzmenkov 2020-10-22 19:47:20 +03:00
parent 84908df6d8
commit 145e2b012f
14 changed files with 43 additions and 65 deletions

View File

@ -1,9 +1,17 @@
# [draft] OpenTelemetry support
---
toc_priority: 62
toc_title: [experimental] OpenTelemetry
---
# [experimental] OpenTelemetry Support
[OpenTelemetry](https://opentelemetry.io/) is an open standard for collecting
traces and metrics from distributed applications. ClickHouse has some support
for OpenTelemetry.
!!! warning "Warning"
This is an experimental feature that will change in backwards-incompatible ways in future releases.
## Supplying Trace Context to ClickHouse
@ -40,9 +48,9 @@ a dependency on a particular monitoring system, instead only
providing the tracing data conforming to the standard. A natural way to do so
in an SQL RDBMS is a system table. OpenTelemetry trace span information
[required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span)
is stored in the system table called `system.opentelemetry_log`.
is stored in the system table called `system.opentelemetry_span_log`.
The table must be enabled in the server configuration, see the `opentelemetry_log`
The table must be enabled in the server configuration, see the `opentelemetry_span_log`
element in the default config file `config.xml`. It is enabled by default.
The table has the following columns:

View File

@ -597,7 +597,7 @@
<!--
OpenTelemetry log contains OpenTelemetry trace spans.
-->
<opentelemetry_log>
<opentelemetry_span_log>
<!--
The default table creation code is insufficient, this <engine> spec
is a workaround. There is no 'event_time' for this log, but two times,
@ -614,9 +614,9 @@
order by (finish_date, finish_time_us, trace_id)
</engine>
<database>system</database>
<table>opentelemetry_log</table>
<table>opentelemetry_span_log</table>
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
</opentelemetry_log>
</opentelemetry_span_log>
<!-- Crash log. Stores stack traces for fatal errors.

View File

@ -133,8 +133,6 @@ void ClientInfo::read(ReadBuffer & in, const UInt64 client_protocol_revision)
client_version_patch = client_tcp_protocol_version;
}
// TODO what does it even mean to read this structure over HTTP? I thought
// this was for native protocol? See interface == Interface::HTTP.
if (client_protocol_revision >= DBMS_MIN_REVISION_WITH_OPENTELEMETRY)
{
uint8_t have_trace_id = 0;
@ -145,10 +143,6 @@ void ClientInfo::read(ReadBuffer & in, const UInt64 client_protocol_revision)
readBinary(opentelemetry_span_id, in);
readBinary(opentelemetry_tracestate, in);
readBinary(opentelemetry_trace_flags, in);
fmt::print(stderr, "read {:x}, {}, {} at\n{}\n",
opentelemetry_trace_id, opentelemetry_span_id,
opentelemetry_parent_span_id, StackTrace().toString());
}
}
}

View File

@ -59,15 +59,14 @@ public:
String initial_query_id;
Poco::Net::SocketAddress initial_address;
// OpenTelemetry things
// OpenTelemetry trace information.
__uint128_t opentelemetry_trace_id = 0;
// Span ID is not strictly the client info, but convenient to keep here.
// The span id we get in the incoming client info becomes our parent span
// id, and the span id we send becomes downstream parent span id.
UInt64 opentelemetry_span_id = 0;
UInt64 opentelemetry_parent_span_id = 0;
// the incoming tracestate header, we just pass it downstream.
// https://www.w3.org/TR/trace-context/
// The incoming tracestate header and the trace flags, we just pass them downstream.
// They are described at https://www.w3.org/TR/trace-context/
String opentelemetry_tracestate;
UInt8 opentelemetry_trace_flags = 0;

View File

@ -1791,14 +1791,14 @@ std::shared_ptr<AsynchronousMetricLog> Context::getAsynchronousMetricLog()
}
std::shared_ptr<OpenTelemetrySpanLog> Context::getOpenTelemetryLog()
std::shared_ptr<OpenTelemetrySpanLog> Context::getOpenTelemetrySpanLog()
{
auto lock = getLock();
if (!shared->system_logs)
return {};
return shared->system_logs->opentelemetry_log;
return shared->system_logs->opentelemetry_span_log;
}

View File

@ -543,7 +543,7 @@ public:
std::shared_ptr<TextLog> getTextLog();
std::shared_ptr<MetricLog> getMetricLog();
std::shared_ptr<AsynchronousMetricLog> getAsynchronousMetricLog();
std::shared_ptr<OpenTelemetrySpanLog> getOpenTelemetryLog();
std::shared_ptr<OpenTelemetrySpanLog> getOpenTelemetrySpanLog();
/// Returns an object used to log operations with parts if it is possible.
/// Provide table name to make required checks.

View File

@ -21,7 +21,7 @@
#include <Interpreters/TextLog.h>
#include <Interpreters/MetricLog.h>
#include <Interpreters/AsynchronousMetricLog.h>
#include <Interpreters/OpenTelemetryLog.h>
#include <Interpreters/OpenTelemetrySpanLog.h>
#include <Access/ContextAccess.h>
#include <Access/AllowedClientHosts.h>
#include <Databases/IDatabase.h>
@ -323,7 +323,7 @@ BlockIO InterpreterSystemQuery::execute()
[&] () { if (auto text_log = context.getTextLog()) text_log->flush(true); },
[&] () { if (auto metric_log = context.getMetricLog()) metric_log->flush(true); },
[&] () { if (auto asynchronous_metric_log = context.getAsynchronousMetricLog()) asynchronous_metric_log->flush(true); },
[&] () { if (auto opentelemetry_log = context.getOpenTelemetryLog()) opentelemetry_log->flush(true); }
[&] () { if (auto opentelemetry_span_log = context.getOpenTelemetrySpanLog()) opentelemetry_span_log->flush(true); }
);
break;
case Type::STOP_LISTEN_QUERIES:

View File

@ -1,4 +1,4 @@
#include "OpenTelemetryLog.h"
#include "OpenTelemetrySpanLog.h"
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeDate.h>

View File

@ -5,22 +5,6 @@
namespace DB
{
/*
struct OpenTelemetrySpanContext
{
UInt128 trace_id;
UInt64 span_id;
UInt8 trace_flags;
String trace_state;
};
*/
// using TimeMicroseconds = std::chrono::time_point<
// std::chrono::local_t,
// std::chrono::duration<UInt64, std::micro>>;
// TODO figure out precisely which part of this is run time, and which part we
// must log.
struct OpenTelemetrySpan
{
__uint128_t trace_id;
@ -35,13 +19,6 @@ struct OpenTelemetrySpan
// I don't understand how Links work, namely, which direction should they
// point to, and how they are related with parent_span_id, so no Links for
// now.
// The following fields look like something that is runtime only and doesn't
// require logging.
UInt8 trace_flags;
// Vendor-specific info, key-value pairs. Keep it as a string as described
// here: https://w3c.github.io/trace-context/#tracestate-header
String trace_state;
};
struct OpenTelemetrySpanLogElement : public OpenTelemetrySpan

View File

@ -7,7 +7,7 @@
#include <Interpreters/CrashLog.h>
#include <Interpreters/MetricLog.h>
#include <Interpreters/AsynchronousMetricLog.h>
#include <Interpreters/OpenTelemetryLog.h>
#include <Interpreters/OpenTelemetrySpanLog.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <common/logger_useful.h>
@ -88,9 +88,9 @@ SystemLogs::SystemLogs(Context & global_context, const Poco::Util::AbstractConfi
asynchronous_metric_log = createSystemLog<AsynchronousMetricLog>(
global_context, "system", "asynchronous_metric_log", config,
"asynchronous_metric_log");
opentelemetry_log = createSystemLog<OpenTelemetrySpanLog>(
global_context, "system", "opentelemetry_log", config,
"opentelemetry_log");
opentelemetry_span_log = createSystemLog<OpenTelemetrySpanLog>(
global_context, "system", "opentelemetry_span_log", config,
"opentelemetry_span_log");
if (query_log)
logs.emplace_back(query_log.get());
@ -108,8 +108,8 @@ SystemLogs::SystemLogs(Context & global_context, const Poco::Util::AbstractConfi
logs.emplace_back(metric_log.get());
if (asynchronous_metric_log)
logs.emplace_back(asynchronous_metric_log.get());
if (opentelemetry_log)
logs.emplace_back(opentelemetry_log.get());
if (opentelemetry_span_log)
logs.emplace_back(opentelemetry_span_log.get());
try
{

View File

@ -106,8 +106,8 @@ struct SystemLogs
std::shared_ptr<MetricLog> metric_log; /// Used to log all metrics.
/// Metrics from system.asynchronous_metrics.
std::shared_ptr<AsynchronousMetricLog> asynchronous_metric_log;
/// OpenTelemetry trace spans
std::shared_ptr<OpenTelemetrySpanLog> opentelemetry_log;
/// OpenTelemetry trace spans.
std::shared_ptr<OpenTelemetrySpanLog> opentelemetry_span_log;
std::vector<ISystemLog *> logs;
};

View File

@ -31,7 +31,7 @@
#include <Access/EnabledQuota.h>
#include <Interpreters/InterpreterFactory.h>
#include <Interpreters/ProcessList.h>
#include <Interpreters/OpenTelemetryLog.h>
#include <Interpreters/OpenTelemetrySpanLog.h>
#include <Interpreters/QueryLog.h>
#include <Interpreters/InterpreterSetQuery.h>
#include <Interpreters/ApplyWithGlobalVisitor.h>
@ -245,9 +245,9 @@ static void onExceptionBeforeStart(const String & query_for_logging, Context & c
if (auto query_log = context.getQueryLog())
query_log->add(elem);
if (auto opentelemetry_log = context.getOpenTelemetryLog();
if (auto opentelemetry_span_log = context.getOpenTelemetrySpanLog();
context.getClientInfo().opentelemetry_trace_id
&& opentelemetry_log)
&& opentelemetry_span_log)
{
OpenTelemetrySpanLogElement span;
span.trace_id = context.getClientInfo().opentelemetry_trace_id;
@ -275,7 +275,7 @@ static void onExceptionBeforeStart(const String & query_for_logging, Context & c
context.getClientInfo().opentelemetry_tracestate);
}
opentelemetry_log->add(span);
opentelemetry_span_log->add(span);
}
ProfileEvents::increment(ProfileEvents::FailedQuery);
@ -663,9 +663,9 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
query_log->add(elem);
}
if (auto opentelemetry_log = context.getOpenTelemetryLog();
if (auto opentelemetry_span_log = context.getOpenTelemetrySpanLog();
context.getClientInfo().opentelemetry_trace_id
&& opentelemetry_log)
&& opentelemetry_span_log)
{
OpenTelemetrySpanLogElement span;
span.trace_id = context.getClientInfo().opentelemetry_trace_id;
@ -692,7 +692,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
context.getClientInfo().opentelemetry_tracestate);
}
opentelemetry_log->add(span);
opentelemetry_span_log->add(span);
}
};

View File

@ -119,7 +119,7 @@ SRCS(
MutationsInterpreter.cpp
MySQL/InterpretersMySQLDDLQuery.cpp
NullableUtils.cpp
OpenTelemetryLog.cpp
OpenTelemetrySpanLog.cpp
OptimizeIfChains.cpp
OptimizeIfWithConstantConditionVisitor.cpp
PartLog.cpp

View File

@ -15,7 +15,7 @@ select count(*) "'"'"total spans"'"'",
uniqExact(span_id) "'"'"unique spans"'"'",
uniqExactIf(parent_span_id, parent_span_id != 0)
"'"'"unique non-zero parent spans"'"'"
from system.opentelemetry_log
from system.opentelemetry_span_log
where trace_id = reinterpretAsUUID(reverse(unhex('$trace_id')))
and operation_name = 'query'
;
@ -24,7 +24,7 @@ select count(*) "'"'"total spans"'"'",
select count(*) "'"'"initial query spans with proper parent"'"'"
from
(select *, attribute_name, attribute_value
from system.opentelemetry_log
from system.opentelemetry_span_log
array join attribute.names as attribute_name,
attribute.values as attribute_value) o
join system.query_log on query_id = o.attribute_value
@ -39,7 +39,7 @@ select count(*) "'"'"initial query spans with proper parent"'"'"
-- Check that the tracestate header was propagated. It must have exactly the
-- same non-empty value for all 'query' spans in this trace.
select uniqExact(value) "'"'"unique non-empty tracestate values"'"'"
from system.opentelemetry_log
from system.opentelemetry_span_log
array join attribute.names as name, attribute.values as value
where
trace_id = reinterpretAsUUID(reverse(unhex('$trace_id')))
@ -106,7 +106,7 @@ ${CLICKHOUSE_CLIENT} -q "
with count(*) as c
-- expect 200 * 0.1 = 20 sampled events on average
select if(c > 5 and c < 35, 'OK', 'fail: ' || toString(c))
from system.opentelemetry_log
from system.opentelemetry_span_log
array join attribute.names as name, attribute.values as value
where name = 'clickhouse.query_id'
and operation_name = 'query'