Merge pull request #62003 from nickitat/cpu_cgroup_aware

Fix cpu metric collection in cgroups
This commit is contained in:
Nikita Taranov 2024-08-14 11:08:01 +00:00 committed by GitHub
commit 3c5d588302
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 360 additions and 143 deletions

View File

@ -27,27 +27,6 @@ bool cgroupsV2Enabled()
#endif #endif
} }
/// Tells whether the "memory" controller is listed in the "cgroup.controllers" file
/// of the cgroup this process belongs to.
/// Expects cgroupsV2Enabled() to hold (checked with chassert).
/// On builds without OS_LINUX the answer is always false.
bool cgroupsV2MemoryControllerEnabled()
{
#if defined(OS_LINUX)
    chassert(cgroupsV2Enabled());

    /// See https://docs.kernel.org/admin-guide/cgroup-v2.html: "cgroup.controllers" lists the controllers
    /// available to a cgroup and its children, and each level may narrow that set through
    /// "cgroup.subtree_control". Looking at the file of the process' own (bottom-most) cgroup is therefore enough.
    const fs::path process_cgroup = cgroupV2PathOfProcess();
    if (process_cgroup.empty())
        return false;

    std::ifstream controllers_stream(process_cgroup / "cgroup.controllers");
    if (!controllers_stream.is_open())
        return false;

    /// Only the first line of the file is examined.
    std::string first_line;
    std::getline(controllers_stream, first_line);

    const bool memory_listed = first_line.find("memory") != std::string::npos;
    return memory_listed;
#else
    return false;
#endif
}
fs::path cgroupV2PathOfProcess() fs::path cgroupV2PathOfProcess()
{ {
#if defined(OS_LINUX) #if defined(OS_LINUX)
@ -71,3 +50,28 @@ fs::path cgroupV2PathOfProcess()
return {}; return {};
#endif #endif
} }
/// Finds the most deeply nested cgroup v2 directory that contains `file_name`.
/// The search starts at the cgroup of the current process and moves one parent directory up
/// per step until it has left the default cgroups mount.
/// Returns an empty optional when cgroups v2 is not enabled, when the cgroup of the process
/// cannot be determined, when no visited directory contains the file, or on builds without OS_LINUX.
std::optional<std::string> getCgroupsV2PathContainingFile([[maybe_unused]] std::string_view file_name)
{
#if defined(OS_LINUX)
    if (!cgroupsV2Enabled())
        return {};

    const fs::path process_cgroup = cgroupV2PathOfProcess();
    if (process_cgroup.empty())
        return {};

    /// The walk ends once we step above the mount point itself.
    const fs::path stop_dir = default_cgroups_mount.parent_path();

    /// Prefer the bottom-most match; a missing file at one level means we retry at the parent,
    /// because settings are inherited from there.
    for (fs::path dir = process_cgroup; dir != stop_dir; dir = dir.parent_path())
    {
        if (fs::exists(dir / file_name))
            return {dir};
    }

    return {};
#else
    return {};
#endif
}

View File

@ -1,6 +1,7 @@
#pragma once #pragma once
#include <filesystem> #include <filesystem>
#include <string_view>
#if defined(OS_LINUX) #if defined(OS_LINUX)
/// I think it is possible to mount the cgroups hierarchy somewhere else (e.g. when in containers). /// I think it is possible to mount the cgroups hierarchy somewhere else (e.g. when in containers).
@ -11,11 +12,11 @@ static inline const std::filesystem::path default_cgroups_mount = "/sys/fs/cgrou
/// Is cgroups v2 enabled on the system? /// Is cgroups v2 enabled on the system?
bool cgroupsV2Enabled(); bool cgroupsV2Enabled();
/// Is the memory controller of cgroups v2 enabled on the system?
/// Assumes that cgroupsV2Enabled() is enabled.
bool cgroupsV2MemoryControllerEnabled();
/// Detects which cgroup v2 the process belongs to and returns the filesystem path to the cgroup. /// Detects which cgroup v2 the process belongs to and returns the filesystem path to the cgroup.
/// Returns an empty path if the cgroup cannot be determined. /// Returns an empty path if the cgroup cannot be determined.
/// Assumes that cgroups v2 is enabled, i.e. cgroupsV2Enabled() returns true. /// Assumes that cgroups v2 is enabled, i.e. cgroupsV2Enabled() returns true.
std::filesystem::path cgroupV2PathOfProcess(); std::filesystem::path cgroupV2PathOfProcess();
/// Returns the most nested cgroup dir containing the specified file.
/// If cgroups v2 is not enabled - returns an empty optional.
std::optional<std::string> getCgroupsV2PathContainingFile([[maybe_unused]] std::string_view file_name);

View File

@ -19,9 +19,6 @@ std::optional<uint64_t> getCgroupsV2MemoryLimit()
if (!cgroupsV2Enabled()) if (!cgroupsV2Enabled())
return {}; return {};
if (!cgroupsV2MemoryControllerEnabled())
return {};
std::filesystem::path current_cgroup = cgroupV2PathOfProcess(); std::filesystem::path current_cgroup = cgroupV2PathOfProcess();
if (current_cgroup.empty()) if (current_cgroup.empty())
return {}; return {};

View File

@ -1,18 +1,24 @@
#include <Common/formatReadable.h>
#include <Common/AsynchronousMetrics.h> #include <Common/AsynchronousMetrics.h>
#include <Common/Exception.h>
#include <Common/setThreadName.h>
#include <Common/CurrentMetrics.h>
#include <Common/filesystemHelpers.h>
#include <Common/logger_useful.h>
#include <IO/UncompressedCache.h>
#include <IO/MMappedFileCache.h> #include <IO/MMappedFileCache.h>
#include <IO/ReadHelpers.h> #include <IO/ReadHelpers.h>
#include <IO/UncompressedCache.h>
#include <base/cgroupsv2.h>
#include <base/errnoToString.h> #include <base/errnoToString.h>
#include <base/find_symbols.h> #include <base/find_symbols.h>
#include <base/getPageSize.h> #include <base/getPageSize.h>
#include <sys/resource.h> #include <sys/resource.h>
#include <Common/CurrentMetrics.h>
#include <Common/Exception.h>
#include <Common/filesystemHelpers.h>
#include <Common/formatReadable.h>
#include <Common/logger_useful.h>
#include <Common/setThreadName.h>
#include <boost/locale/date_time_facet.hpp>
#include <chrono> #include <chrono>
#include <string_view>
#include "config.h" #include "config.h"
@ -52,6 +58,12 @@ static std::unique_ptr<ReadBufferFromFilePRead> openFileIfExists(const std::stri
return {}; return {};
} }
/// Opens the cgroups v2 metric file `filename` (for example "cpu.stat" or "memory.max").
/// The directory is the one reported by getCgroupsV2PathContainingFile(filename); the resulting
/// full path and `out` are handed to openFileIfExists. If no directory is reported, `out` is left untouched.
static void openCgroupv2MetricFile(const std::string & filename, std::optional<ReadBufferFromFilePRead> & out)
{
    if (auto path = getCgroupsV2PathContainingFile(filename))
    {
        /// The reported directory does not necessarily end with a separator (e.g. "/sys/fs/cgroup/system.slice/foo.service"),
        /// so plain string concatenation would produce a non-existent name like ".../foo.servicecpu.stat".
        /// operator/ inserts the separator only when it is missing.
        const std::filesystem::path metric_file = std::filesystem::path(*path) / filename;
        openFileIfExists(metric_file.c_str(), out);
    }
}
#endif #endif
@ -63,21 +75,15 @@ AsynchronousMetrics::AsynchronousMetrics(
, protocol_server_metrics_func(protocol_server_metrics_func_) , protocol_server_metrics_func(protocol_server_metrics_func_)
{ {
#if defined(OS_LINUX) #if defined(OS_LINUX)
openFileIfExists("/proc/meminfo", meminfo);
openFileIfExists("/proc/loadavg", loadavg);
openFileIfExists("/proc/stat", proc_stat);
openFileIfExists("/proc/cpuinfo", cpuinfo); openFileIfExists("/proc/cpuinfo", cpuinfo);
openFileIfExists("/proc/sys/fs/file-nr", file_nr); openFileIfExists("/proc/sys/fs/file-nr", file_nr);
openFileIfExists("/proc/uptime", uptime);
openFileIfExists("/proc/net/dev", net_dev); openFileIfExists("/proc/net/dev", net_dev);
/// CGroups v2 /// CGroups v2
openFileIfExists("/sys/fs/cgroup/memory.max", cgroupmem_limit_in_bytes); openCgroupv2MetricFile("memory.max", cgroupmem_limit_in_bytes);
if (cgroupmem_limit_in_bytes) openCgroupv2MetricFile("memory.current", cgroupmem_usage_in_bytes);
{ openCgroupv2MetricFile("cpu.max", cgroupcpu_max);
openFileIfExists("/sys/fs/cgroup/memory.current", cgroupmem_usage_in_bytes); openCgroupv2MetricFile("cpu.stat", cgroupcpu_stat);
}
openFileIfExists("/sys/fs/cgroup/cpu.max", cgroupcpu_max);
/// CGroups v1 /// CGroups v1
if (!cgroupmem_limit_in_bytes) if (!cgroupmem_limit_in_bytes)
@ -90,6 +96,21 @@ AsynchronousMetrics::AsynchronousMetrics(
openFileIfExists("/sys/fs/cgroup/cpu/cpu.cfs_period_us", cgroupcpu_cfs_period); openFileIfExists("/sys/fs/cgroup/cpu/cpu.cfs_period_us", cgroupcpu_cfs_period);
openFileIfExists("/sys/fs/cgroup/cpu/cpu.cfs_quota_us", cgroupcpu_cfs_quota); openFileIfExists("/sys/fs/cgroup/cpu/cpu.cfs_quota_us", cgroupcpu_cfs_quota);
} }
if (!cgroupcpu_stat)
openFileIfExists("/sys/fs/cgroup/cpuacct/cpuacct.stat", cgroupcpuacct_stat);
if (!cgroupcpu_stat && !cgroupcpuacct_stat)
{
/// The following files are not cgroup-aware. They are opened only when no cgroup-specific CPU statistics file was found above;
/// otherwise we do not report them at all, to avoid confusing host-wide numbers with the cgroup-scoped ones.
openFileIfExists("/proc/loadavg", loadavg);
openFileIfExists("/proc/stat", proc_stat);
openFileIfExists("/proc/uptime", uptime);
}
/// The same story for memory metrics
if (!cgroupmem_limit_in_bytes)
openFileIfExists("/proc/meminfo", meminfo);
openFileIfExists("/proc/sys/vm/max_map_count", vm_max_map_count); openFileIfExists("/proc/sys/vm/max_map_count", vm_max_map_count);
openFileIfExists("/proc/self/maps", vm_maps); openFileIfExists("/proc/self/maps", vm_maps);
@ -570,6 +591,151 @@ AsynchronousMetrics::NetworkInterfaceStatValues::operator-(const AsynchronousMet
#endif #endif
#if defined(OS_LINUX)
/// Writes the ten `OS*Time` CPU metrics into `new_values`.
/// Every metric is stored under `<base name> + cpu_suffix`; its value is the corresponding field of
/// `delta_values` multiplied by `multiplier`, and it carries a fixed human-readable description.
void AsynchronousMetrics::applyCPUMetricsUpdate(
    AsynchronousMetricValues & new_values, const std::string & cpu_suffix, const ProcStatValuesCPU & delta_values, double multiplier)
{
    /// Stores one metric: scales the raw delta and attaches the description.
    const auto publish = [&](const char * base_name, auto raw_delta, const char * description)
    {
        new_values[base_name + cpu_suffix] = {raw_delta * multiplier, description};
    };

    publish(
        "OSUserTime",
        delta_values.user,
        "The ratio of time the CPU core was running userspace code. This is a system-wide metric, it includes all the processes on the "
        "host machine, not just clickhouse-server."
        " This includes also the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline "
        "stalls, branch mispredictions, running another SMT core)."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");

    publish(
        "OSNiceTime",
        delta_values.nice,
        "The ratio of time the CPU core was running userspace code with higher priority. This is a system-wide metric, it includes all "
        "the processes on the host machine, not just clickhouse-server."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");

    publish(
        "OSSystemTime",
        delta_values.system,
        "The ratio of time the CPU core was running OS kernel (system) code. This is a system-wide metric, it includes all the "
        "processes on the host machine, not just clickhouse-server."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");

    publish(
        "OSIdleTime",
        delta_values.idle,
        "The ratio of time the CPU core was idle (not even ready to run a process waiting for IO) from the OS kernel standpoint. This "
        "is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
        " This does not include the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline "
        "stalls, branch mispredictions, running another SMT core)."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");

    publish(
        "OSIOWaitTime",
        delta_values.iowait,
        "The ratio of time the CPU core was not running the code but when the OS kernel did not run any other process on this CPU as "
        "the processes were waiting for IO. This is a system-wide metric, it includes all the processes on the host machine, not just "
        "clickhouse-server."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");

    publish(
        "OSIrqTime",
        delta_values.irq,
        "The ratio of time spent for running hardware interrupt requests on the CPU. This is a system-wide metric, it includes all the "
        "processes on the host machine, not just clickhouse-server."
        " A high number of this metric may indicate hardware misconfiguration or a very high network load."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");

    publish(
        "OSSoftIrqTime",
        delta_values.softirq,
        "The ratio of time spent for running software interrupt requests on the CPU. This is a system-wide metric, it includes all the "
        "processes on the host machine, not just clickhouse-server."
        " A high number of this metric may indicate inefficient software running on the system."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");

    publish(
        "OSStealTime",
        delta_values.steal,
        "The ratio of time spent in other operating systems by the CPU when running in a virtualized environment. This is a system-wide "
        "metric, it includes all the processes on the host machine, not just clickhouse-server."
        " Not every virtualized environments present this metric, and most of them don't."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");

    publish(
        "OSGuestTime",
        delta_values.guest,
        "The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel (See `man "
        "procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
        " This metric is irrelevant for ClickHouse, but still exists for completeness."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");

    publish(
        "OSGuestNiceTime",
        delta_values.guest_nice,
        "The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel, when a guest "
        "was set to a higher priority (See `man procfs`). This is a system-wide metric, it includes all the processes on the host "
        "machine, not just clickhouse-server."
        " This metric is irrelevant for ClickHouse, but still exists for completeness."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");
}
/// Writes the ten `OS*TimeNormalized` CPU metrics into `new_values`.
/// Each value is the corresponding field of `delta_values_all_cpus` multiplied by `multiplier`
/// and then divided by `num_cpus_to_normalize`, which must be non-zero (checked with chassert).
void AsynchronousMetrics::applyNormalizedCPUMetricsUpdate(
    AsynchronousMetricValues & new_values, double num_cpus_to_normalize, const ProcStatValuesCPU & delta_values_all_cpus, double multiplier)
{
    chassert(num_cpus_to_normalize);

    /// Stores one metric. The multiplication is done before the division, on purpose, to keep the arithmetic order fixed.
    const auto publish = [&](const char * metric_name, auto raw_delta, const char * description)
    {
        new_values[metric_name] = {raw_delta * multiplier / num_cpus_to_normalize, description};
    };

    publish(
        "OSUserTimeNormalized",
        delta_values_all_cpus.user,
        "The value is similar to `OSUserTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");

    publish(
        "OSNiceTimeNormalized",
        delta_values_all_cpus.nice,
        "The value is similar to `OSNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");

    publish(
        "OSSystemTimeNormalized",
        delta_values_all_cpus.system,
        "The value is similar to `OSSystemTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");

    publish(
        "OSIdleTimeNormalized",
        delta_values_all_cpus.idle,
        "The value is similar to `OSIdleTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");

    publish(
        "OSIOWaitTimeNormalized",
        delta_values_all_cpus.iowait,
        "The value is similar to `OSIOWaitTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");

    publish(
        "OSIrqTimeNormalized",
        delta_values_all_cpus.irq,
        "The value is similar to `OSIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of "
        "the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");

    publish(
        "OSSoftIrqTimeNormalized",
        delta_values_all_cpus.softirq,
        "The value is similar to `OSSoftIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval "
        "regardless of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");

    publish(
        "OSStealTimeNormalized",
        delta_values_all_cpus.steal,
        "The value is similar to `OSStealTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");

    publish(
        "OSGuestTimeNormalized",
        delta_values_all_cpus.guest,
        "The value is similar to `OSGuestTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");

    publish(
        "OSGuestNiceTimeNormalized",
        delta_values_all_cpus.guest_nice,
        "The value is similar to `OSGuestNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval "
        "regardless of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");
}
#endif
void AsynchronousMetrics::update(TimePoint update_time, bool force_update) void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
{ {
Stopwatch watch; Stopwatch watch;
@ -831,7 +997,68 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
new_values["CGroupMaxCPU"] = { max_cpu_cgroups, "The maximum number of CPU cores according to CGroups."}; new_values["CGroupMaxCPU"] = { max_cpu_cgroups, "The maximum number of CPU cores according to CGroups."};
} }
if (proc_stat) if (cgroupcpu_stat || cgroupcpuacct_stat)
{
try
{
ReadBufferFromFilePRead & in = cgroupcpu_stat ? *cgroupcpu_stat : *cgroupcpuacct_stat;
ProcStatValuesCPU current_values{};
/// We re-read the file from the beginning each time
in.rewind();
while (!in.eof())
{
String name;
readStringUntilWhitespace(name, in);
skipWhitespaceIfAny(in);
/// `user_usec` for cgroup v2 and `user` for cgroup v1
if (name.starts_with("user"))
{
readText(current_values.user, in);
skipToNextLineOrEOF(in);
}
/// `system_usec` for cgroup v2 and `system` for cgroup v1
else if (name.starts_with("system"))
{
readText(current_values.system, in);
skipToNextLineOrEOF(in);
}
else
skipToNextLineOrEOF(in);
}
if (!first_run)
{
auto get_clock_ticks = [&]()
{
if (auto hz = sysconf(_SC_CLK_TCK); hz != -1)
return hz;
else
throw ErrnoException(ErrorCodes::CANNOT_SYSCONF, "Cannot call 'sysconf' to obtain system HZ");
};
const auto cgroup_version_specific_divisor = cgroupcpu_stat ? 1e6 : get_clock_ticks();
const double multiplier = 1.0 / cgroup_version_specific_divisor
/ (std::chrono::duration_cast<std::chrono::nanoseconds>(time_since_previous_update).count() / 1e9);
const ProcStatValuesCPU delta_values = current_values - proc_stat_values_all_cpus;
applyCPUMetricsUpdate(new_values, /*cpu_suffix=*/"", delta_values, multiplier);
if (max_cpu_cgroups > 0)
applyNormalizedCPUMetricsUpdate(new_values, max_cpu_cgroups, delta_values, multiplier);
}
proc_stat_values_all_cpus = current_values;
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
openCgroupv2MetricFile("cpu.stat", cgroupcpu_stat);
if (!cgroupcpu_stat)
openFileIfExists("/sys/fs/cgroup/cpuacct/cpuacct.stat", cgroupcpuacct_stat);
}
}
else if (proc_stat)
{ {
try try
{ {
@ -886,43 +1113,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
else else
delta_values_all_cpus = delta_values; delta_values_all_cpus = delta_values;
new_values["OSUserTime" + cpu_suffix] = { delta_values.user * multiplier, applyCPUMetricsUpdate(new_values, cpu_suffix, delta_values, multiplier);
"The ratio of time the CPU core was running userspace code. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This includes also the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline stalls, branch mispredictions, running another SMT core)."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSNiceTime" + cpu_suffix] = { delta_values.nice * multiplier,
"The ratio of time the CPU core was running userspace code with higher priority. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSSystemTime" + cpu_suffix] = { delta_values.system * multiplier,
"The ratio of time the CPU core was running OS kernel (system) code. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSIdleTime" + cpu_suffix] = { delta_values.idle * multiplier,
"The ratio of time the CPU core was idle (not even ready to run a process waiting for IO) from the OS kernel standpoint. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This does not include the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline stalls, branch mispredictions, running another SMT core)."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSIOWaitTime" + cpu_suffix] = { delta_values.iowait * multiplier,
"The ratio of time the CPU core was not running the code but when the OS kernel did not run any other process on this CPU as the processes were waiting for IO. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSIrqTime" + cpu_suffix] = { delta_values.irq * multiplier,
"The ratio of time spent for running hardware interrupt requests on the CPU. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" A high number of this metric may indicate hardware misconfiguration or a very high network load."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSSoftIrqTime" + cpu_suffix] = { delta_values.softirq * multiplier,
"The ratio of time spent for running software interrupt requests on the CPU. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" A high number of this metric may indicate inefficient software running on the system."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSStealTime" + cpu_suffix] = { delta_values.steal * multiplier,
"The ratio of time spent in other operating systems by the CPU when running in a virtualized environment. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" Not every virtualized environments present this metric, and most of them don't."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSGuestTime" + cpu_suffix] = { delta_values.guest * multiplier,
"The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel (See `man procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This metric is irrelevant for ClickHouse, but still exists for completeness."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSGuestNiceTime" + cpu_suffix] = { delta_values.guest_nice * multiplier,
"The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel, when a guest was set to a higher priority (See `man procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This metric is irrelevant for ClickHouse, but still exists for completeness."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
} }
prev_values = current_values; prev_values = current_values;
@ -978,38 +1169,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
Float64 num_cpus_to_normalize = max_cpu_cgroups > 0 ? max_cpu_cgroups : num_cpus; Float64 num_cpus_to_normalize = max_cpu_cgroups > 0 ? max_cpu_cgroups : num_cpus;
if (num_cpus_to_normalize > 0) if (num_cpus_to_normalize > 0)
{ applyNormalizedCPUMetricsUpdate(new_values, num_cpus_to_normalize, delta_values_all_cpus, multiplier);
new_values["OSUserTimeNormalized"] = { delta_values_all_cpus.user * multiplier / num_cpus_to_normalize,
"The value is similar to `OSUserTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSNiceTimeNormalized"] = { delta_values_all_cpus.nice * multiplier / num_cpus_to_normalize,
"The value is similar to `OSNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSSystemTimeNormalized"] = { delta_values_all_cpus.system * multiplier / num_cpus_to_normalize,
"The value is similar to `OSSystemTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSIdleTimeNormalized"] = { delta_values_all_cpus.idle * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIdleTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSIOWaitTimeNormalized"] = { delta_values_all_cpus.iowait * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIOWaitTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSIrqTimeNormalized"] = { delta_values_all_cpus.irq * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSSoftIrqTimeNormalized"] = { delta_values_all_cpus.softirq * multiplier / num_cpus_to_normalize,
"The value is similar to `OSSoftIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSStealTimeNormalized"] = { delta_values_all_cpus.steal * multiplier / num_cpus_to_normalize,
"The value is similar to `OSStealTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSGuestTimeNormalized"] = { delta_values_all_cpus.guest * multiplier / num_cpus_to_normalize,
"The value is similar to `OSGuestTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSGuestNiceTimeNormalized"] = { delta_values_all_cpus.guest_nice * multiplier / num_cpus_to_normalize,
"The value is similar to `OSGuestNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
}
} }
proc_stat_values_other = current_other_values; proc_stat_values_other = current_other_values;
@ -1042,8 +1202,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
tryLogCurrentException(__PRETTY_FUNCTION__); tryLogCurrentException(__PRETTY_FUNCTION__);
} }
} }
else if (meminfo)
if (meminfo)
{ {
try try
{ {

View File

@ -126,6 +126,8 @@ private:
std::optional<ReadBufferFromFilePRead> cgroupcpu_cfs_period TSA_GUARDED_BY(data_mutex); std::optional<ReadBufferFromFilePRead> cgroupcpu_cfs_period TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> cgroupcpu_cfs_quota TSA_GUARDED_BY(data_mutex); std::optional<ReadBufferFromFilePRead> cgroupcpu_cfs_quota TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> cgroupcpu_max TSA_GUARDED_BY(data_mutex); std::optional<ReadBufferFromFilePRead> cgroupcpu_max TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> cgroupcpu_stat TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> cgroupcpuacct_stat TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> vm_max_map_count TSA_GUARDED_BY(data_mutex); std::optional<ReadBufferFromFilePRead> vm_max_map_count TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> vm_maps TSA_GUARDED_BY(data_mutex); std::optional<ReadBufferFromFilePRead> vm_maps TSA_GUARDED_BY(data_mutex);
@ -221,6 +223,16 @@ private:
void openBlockDevices(); void openBlockDevices();
void openSensorsChips(); void openSensorsChips();
void openEDAC(); void openEDAC();
/// NOTE(review): the definition is not visible in this hunk. Judging by the signature, this presumably
/// records the CPU metrics computed from `delta_values` (scaled by `multiplier`) into `new_values`,
/// with `cpu_suffix` distinguishing the metric names - confirm against the implementation.
void applyCPUMetricsUpdate(
AsynchronousMetricValues & new_values, const std::string & cpu_suffix, const ProcStatValuesCPU & delta_values, double multiplier);
/// NOTE(review): the definition is not visible in this hunk. Presumably fills the `*Normalized` CPU metrics
/// in `new_values` as `delta_values_all_cpus.<field> * multiplier / num_cpus_to_normalize` - confirm
/// against the implementation.
void applyNormalizedCPUMetricsUpdate(
AsynchronousMetricValues & new_values,
double num_cpus_to_normalize,
const ProcStatValuesCPU & delta_values_all_cpus,
double multiplier);
#endif #endif
void run(); void run();

View File

@ -144,31 +144,6 @@ private:
/// - I did not test what happens if a host has v1 and v2 simultaneously enabled. I believe such /// - I did not test what happens if a host has v1 and v2 simultaneously enabled. I believe such
/// systems existed only for a short transition period. /// systems existed only for a short transition period.
std::optional<std::string> getCgroupsV2Path()
{
if (!cgroupsV2Enabled())
return {};
if (!cgroupsV2MemoryControllerEnabled())
return {};
fs::path current_cgroup = cgroupV2PathOfProcess();
if (current_cgroup.empty())
return {};
/// Return the bottom-most nested current memory file. If there is no such file at the current
/// level, try again at the parent level as memory settings are inherited.
while (current_cgroup != default_cgroups_mount.parent_path())
{
const auto current_path = current_cgroup / "memory.current";
const auto stat_path = current_cgroup / "memory.stat";
if (fs::exists(current_path) && fs::exists(stat_path))
return {current_cgroup};
current_cgroup = current_cgroup.parent_path();
}
return {};
}
std::optional<std::string> getCgroupsV1Path() std::optional<std::string> getCgroupsV1Path()
{ {
auto path = default_cgroups_mount / "memory/memory.stat"; auto path = default_cgroups_mount / "memory/memory.stat";
@ -179,7 +154,7 @@ std::optional<std::string> getCgroupsV1Path()
std::pair<std::string, CgroupsMemoryUsageObserver::CgroupsVersion> getCgroupsPath() std::pair<std::string, CgroupsMemoryUsageObserver::CgroupsVersion> getCgroupsPath()
{ {
auto v2_path = getCgroupsV2Path(); auto v2_path = getCgroupsV2PathContainingFile("memory.current");
if (v2_path.has_value()) if (v2_path.has_value())
return {*v2_path, CgroupsMemoryUsageObserver::CgroupsVersion::V2}; return {*v2_path, CgroupsMemoryUsageObserver::CgroupsVersion::V2};

View File

@ -0,0 +1,69 @@
import pytest
from helpers.cluster import ClickHouseCluster
# One cluster shared by all tests in this module (started and stopped by the `start_cluster` fixture).
cluster = ClickHouseCluster(__file__)
# node1 is the idle control node; node2 is the node the tests load with the CPU-intensive query.
node1 = cluster.add_instance("node1", stay_alive=True)
node2 = cluster.add_instance("node2", stay_alive=True)
@pytest.fixture(scope="module")
def start_cluster():
    """Module-scoped fixture: start the shared cluster, yield it to the tests, then shut it down."""
    try:
        cluster.start()
        yield cluster
    finally:
        # Executed whether start-up or a test failed, so shutdown is always attempted.
        cluster.shutdown()
def run_cpu_intensive_task(node):
    """Run a CPU-heavy aggregation over `system.numbers_mt` on `node`.

    The query carries `max_execution_time=10` and is sent with `ignore_error=True`
    (presumably because it is expected to end by hitting that limit).
    """
    cpu_burning_query = (
        "SELECT sum(*) FROM system.numbers_mt FORMAT Null SETTINGS max_execution_time=10"
    )
    node.query(cpu_burning_query, ignore_error=True)
def get_async_metric(node, metric):
    """Return the highest per-second average of `metric` logged during the last 60 seconds.

    Logs are flushed first, then `system.asynchronous_metric_log` is aggregated into
    one-second buckets and the maximum bucket average is selected. The query result is
    returned with leading and trailing newlines stripped.
    """
    # Flush before reading so the query below runs against freshly written log rows.
    node.query("SYSTEM FLUSH LOGS")

    peak_value_query = f"""
SELECT max(value)
FROM (
SELECT toStartOfInterval(event_time, toIntervalSecond(1)) AS t, avg(value) AS value
FROM system.asynchronous_metric_log
WHERE event_time >= now() - 60 AND metric = '{metric}'
GROUP BY t
)
SETTINGS max_threads = 1
"""
    raw_result = node.query(peak_value_query)
    return raw_result.strip("\n")
def test_user_cpu_accounting(start_cluster):
    """`OSUserTime` of a node must reflect that node's own CPU usage, not its neighbour's."""
    if node1.is_built_with_sanitizer():
        pytest.skip("Disabled for sanitizers")

    # Only node2 is loaded; node1 stays idle and must not be charged for node2's work.
    run_cpu_intensive_task(node2)

    idle_node_user_time = float(get_async_metric(node1, "OSUserTime"))
    assert idle_node_user_time < 2

    # The loaded node, in turn, must account for the CPU time spent by its own server.
    busy_node_user_time = float(get_async_metric(node2, "OSUserTime"))
    # Deliberately weak lower bound: CI gives no guarantee about how much CPU time the process gets.
    assert busy_node_user_time > 2
def test_normalized_user_cpu(start_cluster):
    """`OSUserTimeNormalized` must stay below 1.01 on both the idle and the loaded node."""
    if node1.is_built_with_sanitizer():
        pytest.skip("Disabled for sanitizers")

    # Put load on node2 only, then check the normalized metric on node1 first and node2 second.
    run_cpu_intensive_task(node2)

    for node in (node1, node2):
        normalized_user_time = float(get_async_metric(node, "OSUserTimeNormalized"))
        assert normalized_user_time < 1.01