Merge pull request #24416 from elevankoff/system-metrics

Common system metrics collection
This commit is contained in:
alexey-milovidov 2021-07-07 03:21:49 +03:00 committed by GitHub
commit 80eaf85301
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 1066 additions and 108 deletions

View File

@ -1178,11 +1178,11 @@ create view right_async_metric_log as
-- Use the right log as time reference because it may have higher precision.
create table metrics engine File(TSV, 'metrics/metrics.tsv') as
with (select min(event_time) from right_async_metric_log) as min_time
select name metric, r.event_time - min_time event_time, l.value as left, r.value as right
select metric, r.event_time - min_time event_time, l.value as left, r.value as right
from right_async_metric_log r
asof join file('left-async-metric-log.tsv', TSVWithNamesAndTypes,
'$(cat left-async-metric-log.tsv.columns)') l
on l.name = r.name and r.event_time <= l.event_time
on l.metric = r.metric and r.event_time <= l.event_time
order by metric, event_time
;

View File

@ -1159,7 +1159,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
{
/// This object will periodically calculate some metrics.
AsynchronousMetrics async_metrics(
global_context, config().getUInt("asynchronous_metrics_update_period_s", 60), servers_to_start_before_tables, servers);
global_context, config().getUInt("asynchronous_metrics_update_period_s", 1), servers_to_start_before_tables, servers);
attachSystemTablesAsync(*DatabaseCatalog::instance().getSystemDatabase(), async_metrics);
for (const auto & listen_host : listen_hosts)

View File

@ -583,7 +583,7 @@
<port>9019</port>
</jdbc_bridge>
-->
<!-- Configuration of clusters that could be used in Distributed tables.
https://clickhouse.tech/docs/en/operations/table_engines/distributed/
-->
@ -917,7 +917,7 @@
Asynchronous metrics are updated once a minute, so there is
no need to flush more often.
-->
<flush_interval_milliseconds>60000</flush_interval_milliseconds>
<flush_interval_milliseconds>7000</flush_interval_milliseconds>
</asynchronous_metric_log>
<!--

View File

@ -557,6 +557,7 @@
M(587, CONCURRENT_ACCESS_NOT_SUPPORTED) \
M(588, DISTRIBUTED_BROKEN_BATCH_INFO) \
M(589, DISTRIBUTED_BROKEN_BATCH_FILES) \
M(590, CANNOT_SYSCONF) \
\
M(998, POSTGRESQL_CONNECTION_FAILURE) \
M(999, KEEPER_EXCEPTION) \

View File

@ -149,7 +149,7 @@ off_t ReadBufferFromFileDescriptor::seek(off_t offset, int whence)
off_t res = ::lseek(fd, new_pos, SEEK_SET);
if (-1 == res)
throwFromErrnoWithPath("Cannot seek through file " + getFileName(), getFileName(),
ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
file_offset_of_buffer_end = new_pos;
watch.stop();
@ -160,6 +160,20 @@ off_t ReadBufferFromFileDescriptor::seek(off_t offset, int whence)
}
void ReadBufferFromFileDescriptor::rewind()
{
ProfileEvents::increment(ProfileEvents::Seek);
off_t res = ::lseek(fd, 0, SEEK_SET);
if (-1 == res)
throwFromErrnoWithPath("Cannot seek through file " + getFileName(), getFileName(),
ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
/// Clearing the buffer with existing data. New data will be read on subsequent call to 'next'.
working_buffer.resize(0);
pos = working_buffer.begin();
}
/// Assuming file descriptor supports 'select', check that we have data to read or wait until timeout.
bool ReadBufferFromFileDescriptor::poll(size_t timeout_microseconds)
{

View File

@ -39,6 +39,9 @@ public:
/// If 'offset' is small enough to stay in buffer after seek, then true seek in file does not happen.
off_t seek(off_t off, int whence) override;
/// Seek to the beginning, discarding already read data if any. Useful to reread file that changes on every read.
void rewind();
off_t size();
void setProgressCallback(ContextPtr context);

View File

@ -18,7 +18,7 @@ NamesAndTypesList AsynchronousMetricLogElement::getNamesAndTypes()
{"event_date", std::make_shared<DataTypeDate>()},
{"event_time", std::make_shared<DataTypeDateTime>()},
{"event_time_microseconds", std::make_shared<DataTypeDateTime64>(6)},
{"name", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
{"metric", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
{"value", std::make_shared<DataTypeFloat64>(),}
};
}

File diff suppressed because it is too large Load Diff

View File

@ -3,11 +3,15 @@
#include <Interpreters/Context_fwd.h>
#include <Common/MemoryStatisticsOS.h>
#include <Common/ThreadPool.h>
#include <IO/ReadBufferFromFile.h>
#include <condition_variable>
#include <map>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
#include <optional>
#include <unordered_map>
@ -15,6 +19,7 @@ namespace DB
{
class ProtocolServerAdapter;
class ReadBuffer;
using AsynchronousMetricValue = double;
using AsynchronousMetricValues = std::unordered_map<std::string, AsynchronousMetricValue>;
@ -23,10 +28,30 @@ using AsynchronousMetricValues = std::unordered_map<std::string, AsynchronousMet
/** Periodically (by default, each minute, starting at 30 seconds offset)
* calculates and updates some metrics,
* that are not updated automatically (so, need to be asynchronously calculated).
*
* This includes both ClickHouse-related metrics (like memory usage of ClickHouse process)
* and common OS-related metrics (like total memory usage on the server).
*/
class AsynchronousMetrics : WithContext
{
public:
/// The default value of update_period_seconds is for ClickHouse-over-YT
/// in Arcadia -- it uses its own server implementation that also uses these
/// metrics.
AsynchronousMetrics(
ContextPtr global_context_,
int update_period_seconds,
std::shared_ptr<std::vector<ProtocolServerAdapter>> servers_to_start_before_tables_,
std::shared_ptr<std::vector<ProtocolServerAdapter>> servers_);
~AsynchronousMetrics();
/// Separate method allows to initialize the `servers` variable beforehand.
void start();
/// Returns copy of all values.
AsynchronousMetricValues getValues() const;
#if defined(ARCADIA_BUILD)
/// This constructor needs only to provide backward compatibility with some other projects (hello, Arcadia).
/// Never use this in the ClickHouse codebase.
@ -39,35 +64,6 @@ public:
}
#endif
/// The default value of update_period_seconds is for ClickHouse-over-YT
/// in Arcadia -- it uses its own server implementation that also uses these
/// metrics.
AsynchronousMetrics(
ContextPtr global_context_,
int update_period_seconds,
std::shared_ptr<std::vector<ProtocolServerAdapter>> servers_to_start_before_tables_,
std::shared_ptr<std::vector<ProtocolServerAdapter>> servers_)
: WithContext(global_context_)
, update_period(update_period_seconds)
, servers_to_start_before_tables(servers_to_start_before_tables_)
, servers(servers_)
{
}
~AsynchronousMetrics();
/// Separate method allows to initialize the `servers` variable beforehand.
void start()
{
/// Update once right now, to make metrics available just after server start
/// (without waiting for asynchronous_metrics_update_period_s).
update();
thread = std::make_unique<ThreadFromGlobalPool>([this] { run(); });
}
/// Returns copy of all values.
AsynchronousMetricValues getValues() const;
private:
const std::chrono::seconds update_period;
std::shared_ptr<std::vector<ProtocolServerAdapter>> servers_to_start_before_tables{nullptr};
@ -78,14 +74,113 @@ private:
bool quit {false};
AsynchronousMetricValues values;
/// Some values are incremental and we have to calculate the difference.
/// On first run we will only collect the values to subtract later.
bool first_run = true;
std::chrono::system_clock::time_point previous_update_time;
#if defined(OS_LINUX)
MemoryStatisticsOS memory_stat;
std::optional<ReadBufferFromFile> meminfo;
std::optional<ReadBufferFromFile> loadavg;
std::optional<ReadBufferFromFile> proc_stat;
std::optional<ReadBufferFromFile> cpuinfo;
std::optional<ReadBufferFromFile> file_nr;
std::optional<ReadBufferFromFile> uptime;
std::optional<ReadBufferFromFile> net_dev;
std::vector<std::unique_ptr<ReadBufferFromFile>> thermal;
std::unordered_map<String /* device name */,
std::unordered_map<String /* label name */,
std::unique_ptr<ReadBufferFromFile>>> hwmon_devices;
std::vector<std::pair<
std::unique_ptr<ReadBufferFromFile> /* correctable errors */,
std::unique_ptr<ReadBufferFromFile> /* uncorrectable errors */>> edac;
std::unordered_map<String /* device name */, std::unique_ptr<ReadBufferFromFile>> block_devs;
/// TODO: socket statistics.
struct ProcStatValuesCPU
{
uint64_t user;
uint64_t nice;
uint64_t system;
uint64_t idle;
uint64_t iowait;
uint64_t irq;
uint64_t softirq;
uint64_t steal;
uint64_t guest;
uint64_t guest_nice;
void read(ReadBuffer & in);
ProcStatValuesCPU operator-(const ProcStatValuesCPU & other) const;
};
struct ProcStatValuesOther
{
uint64_t interrupts;
uint64_t context_switches;
uint64_t processes_created;
ProcStatValuesOther operator-(const ProcStatValuesOther & other) const;
};
ProcStatValuesCPU proc_stat_values_all_cpus{};
ProcStatValuesOther proc_stat_values_other{};
std::vector<ProcStatValuesCPU> proc_stat_values_per_cpu;
/// https://www.kernel.org/doc/Documentation/block/stat.txt
struct BlockDeviceStatValues
{
uint64_t read_ios;
uint64_t read_merges;
uint64_t read_sectors;
uint64_t read_ticks;
uint64_t write_ios;
uint64_t write_merges;
uint64_t write_sectors;
uint64_t write_ticks;
uint64_t in_flight_ios;
uint64_t io_ticks;
uint64_t time_in_queue;
uint64_t discard_ops;
uint64_t discard_merges;
uint64_t discard_sectors;
uint64_t discard_ticks;
void read(ReadBuffer & in);
BlockDeviceStatValues operator-(const BlockDeviceStatValues & other) const;
};
std::unordered_map<String /* device name */, BlockDeviceStatValues> block_device_stats;
struct NetworkInterfaceStatValues
{
uint64_t recv_bytes;
uint64_t recv_packets;
uint64_t recv_errors;
uint64_t recv_drop;
uint64_t send_bytes;
uint64_t send_packets;
uint64_t send_errors;
uint64_t send_drop;
NetworkInterfaceStatValues operator-(const NetworkInterfaceStatValues & other) const;
};
std::unordered_map<String /* device name */, NetworkInterfaceStatValues> network_interface_stats;
#endif
std::unique_ptr<ThreadFromGlobalPool> thread;
void run();
void update();
void update(std::chrono::system_clock::time_point update_time);
};
}