Merge pull request #24416 from elevankoff/system-metrics

Common system metrics collection

Contained in commit 80eaf85301.
@@ -1178,11 +1178,11 @@ create view right_async_metric_log as
 -- Use the right log as time reference because it may have higher precision.
 create table metrics engine File(TSV, 'metrics/metrics.tsv') as
     with (select min(event_time) from right_async_metric_log) as min_time
-    select name metric, r.event_time - min_time event_time, l.value as left, r.value as right
+    select metric, r.event_time - min_time event_time, l.value as left, r.value as right
     from right_async_metric_log r
     asof join file('left-async-metric-log.tsv', TSVWithNamesAndTypes,
         '$(cat left-async-metric-log.tsv.columns)') l
-    on l.name = r.name and r.event_time <= l.event_time
+    on l.metric = r.metric and r.event_time <= l.event_time
     order by metric, event_time
     ;

@@ -1159,7 +1159,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
     {
         /// This object will periodically calculate some metrics.
         AsynchronousMetrics async_metrics(
-            global_context, config().getUInt("asynchronous_metrics_update_period_s", 60), servers_to_start_before_tables, servers);
+            global_context, config().getUInt("asynchronous_metrics_update_period_s", 1), servers_to_start_before_tables, servers);
         attachSystemTablesAsync(*DatabaseCatalog::instance().getSystemDatabase(), async_metrics);

         for (const auto & listen_host : listen_hosts)
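The hunk above only changes the default update period (60 s to 1 s). A hedged sketch of the surrounding lifecycle, condensed from this startup path and the class declaration later in this diff; it is a fragment of server code, not standalone, and the ordering of the start() call is an assumption based on the "initialize the `servers` variable beforehand" comment:

// Inside the server startup path (condensed, illustrative):
auto servers_to_start_before_tables = std::make_shared<std::vector<ProtocolServerAdapter>>();
auto servers = std::make_shared<std::vector<ProtocolServerAdapter>>();

/// Default update period is now 1 second instead of 60.
AsynchronousMetrics async_metrics(
    global_context,
    config().getUInt("asynchronous_metrics_update_period_s", 1),
    servers_to_start_before_tables,
    servers);

/// Expose the values through the system database.
attachSystemTablesAsync(*DatabaseCatalog::instance().getSystemDatabase(), async_metrics);

/// ... listeners are created and pushed into `servers` here ...

/// start() is separate from the constructor so `servers` can be filled first;
/// it performs one immediate update and then spawns the background thread.
async_metrics.start();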
@@ -583,7 +583,7 @@
         <port>9019</port>
     </jdbc_bridge>
     -->

    <!-- Configuration of clusters that could be used in Distributed tables.
         https://clickhouse.tech/docs/en/operations/table_engines/distributed/
      -->
@@ -917,7 +917,7 @@
          Asynchronous metrics are updated once a minute, so there is
          no need to flush more often.
        -->
-        <flush_interval_milliseconds>60000</flush_interval_milliseconds>
+        <flush_interval_milliseconds>7000</flush_interval_milliseconds>
    </asynchronous_metric_log>

    <!--
@@ -557,6 +557,7 @@
     M(587, CONCURRENT_ACCESS_NOT_SUPPORTED) \
     M(588, DISTRIBUTED_BROKEN_BATCH_INFO) \
     M(589, DISTRIBUTED_BROKEN_BATCH_FILES) \
+    M(590, CANNOT_SYSCONF) \
     \
     M(998, POSTGRESQL_CONNECTION_FAILURE) \
     M(999, KEEPER_EXCEPTION) \
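CANNOT_SYSCONF is new in this change. A plausible use (an assumption, since the metrics implementation file is suppressed later in this diff) is guarding the sysconf(_SC_CLK_TCK) call needed to convert /proc/stat tick counters into seconds:

#include <unistd.h>
#include <Common/Exception.h>

namespace DB::ErrorCodes { extern const int CANNOT_SYSCONF; }

/// Clock ticks per second; /proc/stat CPU counters are expressed in these units.
/// Hedged sketch, not the actual collector code.
static double clockTicksPerSecond()
{
    long ticks = sysconf(_SC_CLK_TCK);
    if (-1 == ticks)
        DB::throwFromErrno("Cannot call 'sysconf' to obtain _SC_CLK_TCK", DB::ErrorCodes::CANNOT_SYSCONF);
    return static_cast<double>(ticks);
}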
@@ -149,7 +149,7 @@ off_t ReadBufferFromFileDescriptor::seek(off_t offset, int whence)
         off_t res = ::lseek(fd, new_pos, SEEK_SET);
         if (-1 == res)
             throwFromErrnoWithPath("Cannot seek through file " + getFileName(), getFileName(),
-                ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
+                                   ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
         file_offset_of_buffer_end = new_pos;

         watch.stop();
@@ -160,6 +160,20 @@ off_t ReadBufferFromFileDescriptor::seek(off_t offset, int whence)
 }


+void ReadBufferFromFileDescriptor::rewind()
+{
+    ProfileEvents::increment(ProfileEvents::Seek);
+    off_t res = ::lseek(fd, 0, SEEK_SET);
+    if (-1 == res)
+        throwFromErrnoWithPath("Cannot seek through file " + getFileName(), getFileName(),
+                               ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
+
+    /// Clearing the buffer with existing data. New data will be read on subsequent call to 'next'.
+    working_buffer.resize(0);
+    pos = working_buffer.begin();
+}
+
+
 /// Assuming file descriptor supports 'select', check that we have data to read or wait until timeout.
 bool ReadBufferFromFileDescriptor::poll(size_t timeout_microseconds)
 {
@@ -39,6 +39,9 @@ public:
     /// If 'offset' is small enough to stay in buffer after seek, then true seek in file does not happen.
     off_t seek(off_t off, int whence) override;

+    /// Seek to the beginning, discarding already read data if any. Useful to reread file that changes on every read.
+    void rewind();
+
    off_t size();

    void setProgressCallback(ContextPtr context);
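The comment on rewind() states the intent: keep a descriptor open and reread a file whose contents change on every read, which is what the /proc readers declared later in this diff need. A minimal sketch of that pattern, assuming the collector-style usage; the file name and parsing are illustrative, not the actual AsynchronousMetrics code:

#include <IO/ReadBufferFromFile.h>
#include <IO/ReadHelpers.h>
#include <optional>

using namespace DB;

/// Opened once; reused across update cycles (hedged sketch).
std::optional<ReadBufferFromFile> loadavg;

double readLoadAverage1m()
{
    if (!loadavg)
        loadavg.emplace("/proc/loadavg");
    else
        loadavg->rewind();  /// Seek to offset 0 and drop data buffered by the previous read.

    Float64 avg1 = 0;
    readText(avg1, *loadavg);  /// Parse the first whitespace-separated field.
    return avg1;
}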
@@ -18,7 +18,7 @@ NamesAndTypesList AsynchronousMetricLogElement::getNamesAndTypes()
         {"event_date", std::make_shared<DataTypeDate>()},
         {"event_time", std::make_shared<DataTypeDateTime>()},
         {"event_time_microseconds", std::make_shared<DataTypeDateTime64>(6)},
-        {"name", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
+        {"metric", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
        {"value", std::make_shared<DataTypeFloat64>(),}
    };
}
(File diff suppressed because it is too large.)
@@ -3,11 +3,15 @@
 #include <Interpreters/Context_fwd.h>
 #include <Common/MemoryStatisticsOS.h>
 #include <Common/ThreadPool.h>
+#include <IO/ReadBufferFromFile.h>

 #include <condition_variable>
 #include <map>
 #include <mutex>
 #include <string>
 #include <thread>
 #include <vector>
+#include <optional>
 #include <unordered_map>

@@ -15,6 +19,7 @@ namespace DB
 {

 class ProtocolServerAdapter;
+class ReadBuffer;

 using AsynchronousMetricValue = double;
 using AsynchronousMetricValues = std::unordered_map<std::string, AsynchronousMetricValue>;
@@ -23,10 +28,30 @@ using AsynchronousMetricValues = std::unordered_map<std::string, AsynchronousMet
 /** Periodically (by default, each minute, starting at 30 seconds offset)
   * calculates and updates some metrics,
   * that are not updated automatically (so, need to be asynchronously calculated).
+  *
+  * This includes both ClickHouse-related metrics (like memory usage of ClickHouse process)
+  * and common OS-related metrics (like total memory usage on the server).
   */
 class AsynchronousMetrics : WithContext
 {
 public:
+    /// The default value of update_period_seconds is for ClickHouse-over-YT
+    /// in Arcadia -- it uses its own server implementation that also uses these
+    /// metrics.
+    AsynchronousMetrics(
+        ContextPtr global_context_,
+        int update_period_seconds,
+        std::shared_ptr<std::vector<ProtocolServerAdapter>> servers_to_start_before_tables_,
+        std::shared_ptr<std::vector<ProtocolServerAdapter>> servers_);
+
+    ~AsynchronousMetrics();
+
+    /// Separate method allows to initialize the `servers` variable beforehand.
+    void start();
+
+    /// Returns copy of all values.
+    AsynchronousMetricValues getValues() const;
+
 #if defined(ARCADIA_BUILD)
     /// This constructor needs only to provide backward compatibility with some other projects (hello, Arcadia).
     /// Never use this in the ClickHouse codebase.
@@ -39,35 +64,6 @@ public:
     }
 #endif

-    /// The default value of update_period_seconds is for ClickHouse-over-YT
-    /// in Arcadia -- it uses its own server implementation that also uses these
-    /// metrics.
-    AsynchronousMetrics(
-        ContextPtr global_context_,
-        int update_period_seconds,
-        std::shared_ptr<std::vector<ProtocolServerAdapter>> servers_to_start_before_tables_,
-        std::shared_ptr<std::vector<ProtocolServerAdapter>> servers_)
-        : WithContext(global_context_)
-        , update_period(update_period_seconds)
-        , servers_to_start_before_tables(servers_to_start_before_tables_)
-        , servers(servers_)
-    {
-    }
-
-    ~AsynchronousMetrics();
-
-    /// Separate method allows to initialize the `servers` variable beforehand.
-    void start()
-    {
-        /// Update once right now, to make metrics available just after server start
-        /// (without waiting for asynchronous_metrics_update_period_s).
-        update();
-        thread = std::make_unique<ThreadFromGlobalPool>([this] { run(); });
-    }
-
-    /// Returns copy of all values.
-    AsynchronousMetricValues getValues() const;
-
 private:
     const std::chrono::seconds update_period;
     std::shared_ptr<std::vector<ProtocolServerAdapter>> servers_to_start_before_tables{nullptr};
@@ -78,14 +74,113 @@ private:
     bool quit {false};
     AsynchronousMetricValues values;

+    /// Some values are incremental and we have to calculate the difference.
+    /// On first run we will only collect the values to subtract later.
+    bool first_run = true;
+    std::chrono::system_clock::time_point previous_update_time;
+
 #if defined(OS_LINUX)
     MemoryStatisticsOS memory_stat;
+
+    std::optional<ReadBufferFromFile> meminfo;
+    std::optional<ReadBufferFromFile> loadavg;
+    std::optional<ReadBufferFromFile> proc_stat;
+    std::optional<ReadBufferFromFile> cpuinfo;
+    std::optional<ReadBufferFromFile> file_nr;
+    std::optional<ReadBufferFromFile> uptime;
+    std::optional<ReadBufferFromFile> net_dev;
+
+    std::vector<std::unique_ptr<ReadBufferFromFile>> thermal;
+
+    std::unordered_map<String /* device name */,
+        std::unordered_map<String /* label name */,
+            std::unique_ptr<ReadBufferFromFile>>> hwmon_devices;
+
+    std::vector<std::pair<
+        std::unique_ptr<ReadBufferFromFile> /* correctable errors */,
+        std::unique_ptr<ReadBufferFromFile> /* uncorrectable errors */>> edac;
+
+    std::unordered_map<String /* device name */, std::unique_ptr<ReadBufferFromFile>> block_devs;
+
+    /// TODO: socket statistics.
+
+    struct ProcStatValuesCPU
+    {
+        uint64_t user;
+        uint64_t nice;
+        uint64_t system;
+        uint64_t idle;
+        uint64_t iowait;
+        uint64_t irq;
+        uint64_t softirq;
+        uint64_t steal;
+        uint64_t guest;
+        uint64_t guest_nice;
+
+        void read(ReadBuffer & in);
+        ProcStatValuesCPU operator-(const ProcStatValuesCPU & other) const;
+    };
+
+    struct ProcStatValuesOther
+    {
+        uint64_t interrupts;
+        uint64_t context_switches;
+        uint64_t processes_created;
+
+        ProcStatValuesOther operator-(const ProcStatValuesOther & other) const;
+    };
+
+    ProcStatValuesCPU proc_stat_values_all_cpus{};
+    ProcStatValuesOther proc_stat_values_other{};
+    std::vector<ProcStatValuesCPU> proc_stat_values_per_cpu;
+
+    /// https://www.kernel.org/doc/Documentation/block/stat.txt
+    struct BlockDeviceStatValues
+    {
+        uint64_t read_ios;
+        uint64_t read_merges;
+        uint64_t read_sectors;
+        uint64_t read_ticks;
+        uint64_t write_ios;
+        uint64_t write_merges;
+        uint64_t write_sectors;
+        uint64_t write_ticks;
+        uint64_t in_flight_ios;
+        uint64_t io_ticks;
+        uint64_t time_in_queue;
+        uint64_t discard_ops;
+        uint64_t discard_merges;
+        uint64_t discard_sectors;
+        uint64_t discard_ticks;
+
+        void read(ReadBuffer & in);
+        BlockDeviceStatValues operator-(const BlockDeviceStatValues & other) const;
+    };
+
+    std::unordered_map<String /* device name */, BlockDeviceStatValues> block_device_stats;
+
+    struct NetworkInterfaceStatValues
+    {
+        uint64_t recv_bytes;
+        uint64_t recv_packets;
+        uint64_t recv_errors;
+        uint64_t recv_drop;
+        uint64_t send_bytes;
+        uint64_t send_packets;
+        uint64_t send_errors;
+        uint64_t send_drop;
+
+        NetworkInterfaceStatValues operator-(const NetworkInterfaceStatValues & other) const;
+    };
+
+    std::unordered_map<String /* device name */, NetworkInterfaceStatValues> network_interface_stats;
+
 #endif

     std::unique_ptr<ThreadFromGlobalPool> thread;

     void run();
     void update();
+    void update(std::chrono::system_clock::time_point update_time);
 };

 }
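The `first_run` / `previous_update_time` members above describe the incremental-counter pattern: /proc/stat and the per-device counters only ever grow, so each update subtracts the previous snapshot (the operator- overloads) and divides by elapsed wall-clock time to get a rate. Since AsynchronousMetrics.cpp is suppressed in this diff, the following is only a hedged sketch of that pattern with a simplified stand-in value type, not the actual implementation:

#include <chrono>
#include <cstdint>
#include <utility>

/// Simplified stand-in for structs like ProcStatValuesCPU above:
/// monotonically growing counters sampled from /proc.
struct Counters
{
    uint64_t user = 0;
    uint64_t system = 0;

    Counters operator-(const Counters & other) const
    {
        return {user - other.user, system - other.system};
    }
};

class DeltaRates
{
public:
    /// Returns user/system rates per second, or zeros on the first call
    /// (nothing to subtract yet, mirroring the `first_run` flag above).
    std::pair<double, double> update(const Counters & current,
                                     std::chrono::system_clock::time_point now)
    {
        std::pair<double, double> rates{0.0, 0.0};
        if (!first_run)
        {
            const Counters delta = current - previous;
            const double elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(
                now - previous_update_time).count();
            if (elapsed > 0)
                rates = {delta.user / elapsed, delta.system / elapsed};
        }
        previous = current;
        previous_update_time = now;
        first_run = false;
        return rates;
    }

private:
    bool first_run = true;
    Counters previous{};
    std::chrono::system_clock::time_point previous_update_time;
};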