Merge pull request #62003 from nickitat/cpu_cgroup_aware

Fix cpu metric collection in cgroups
2024-12-18 04:12:19 +00:00 · 2024-08-14 11:08:01 +00:00 · 2024-08-14 11:08:01 +00:00 · 3c5d588302
commit 3c5d588302
parent 2b1fce007f b5134fd490
8 changed files with 360 additions and 143 deletions
--- a/base/base/cgroupsv2.cpp
+++ b/base/base/cgroupsv2.cpp
@ -27,27 +27,6 @@ bool cgroupsV2Enabled()
 #endif
 }
 /// Reports whether the "memory" controller is listed in the "cgroup.controllers" file of the process' own cgroup.
 /// Must only be called when cgroupsV2Enabled() holds (checked with chassert).
 /// Always returns false on non-Linux builds.
 bool cgroupsV2MemoryControllerEnabled()
 {
 #if defined(OS_LINUX)
    chassert(cgroupsV2Enabled());
    /// Per https://docs.kernel.org/admin-guide/cgroup-v2.html, "cgroup.controllers" lists the controllers available to the
    /// current cgroup and its children, and "cgroup.subtree_control" may narrow that set from one level to the next.
    /// Looking at the "cgroup.controllers" file of the most deeply nested cgroup is therefore enough.
    const fs::path process_cgroup = cgroupV2PathOfProcess();
    if (process_cgroup.empty())
        return false;
    /// Stays empty when the file cannot be opened or read, which makes the lookup below fail.
    std::string enabled_controllers;
    if (std::ifstream in(process_cgroup / "cgroup.controllers"); in.is_open())
        std::getline(in, enabled_controllers);
    return enabled_controllers.find("memory") != std::string::npos;
 #else
    return false;
 #endif
 }
 fs::path cgroupV2PathOfProcess()
 {
 #if defined(OS_LINUX)
@ -71,3 +50,28 @@ fs::path cgroupV2PathOfProcess()
    return {};
 #endif
 }
 /// Finds the most deeply nested cgroup v2 directory, starting at the cgroup of the current process and
 /// moving towards the parent of the cgroups mount point, that contains a file called `file_name`.
 /// Returns that directory as a string, or an empty optional when cgroups v2 is disabled, the process'
 /// cgroup is unknown, no level contains the file, or the build is not for Linux.
 std::optional<std::string> getCgroupsV2PathContainingFile([[maybe_unused]] std::string_view file_name)
 {
 #if defined(OS_LINUX)
    if (!cgroupsV2Enabled())
        return {};
    const fs::path process_cgroup = cgroupV2PathOfProcess();
    if (process_cgroup.empty())
        return {};
    /// Settings are inherited, so when the file is missing at one level we retry one level up.
    /// The walk ends once it would step above the cgroups mount point.
    const fs::path stop_dir = default_cgroups_mount.parent_path();
    for (fs::path dir = process_cgroup; dir != stop_dir; dir = dir.parent_path())
    {
        if (fs::exists(dir / file_name))
            return dir.string();
    }
    return {};
 #else
    return {};
 #endif
 }
--- a/base/base/cgroupsv2.h
+++ b/base/base/cgroupsv2.h
@ -1,6 +1,7 @@
 #pragma once
 #include <filesystem>
 #include <string_view>
 #if defined(OS_LINUX)
 /// I think it is possible to mount the cgroups hierarchy somewhere else (e.g. when in containers).
@ -11,11 +12,11 @@ static inline const std::filesystem::path default_cgroups_mount = "/sys/fs/cgrou
 /// Is cgroups v2 enabled on the system?
 bool cgroupsV2Enabled();
 /// Is the memory controller of cgroups v2 enabled on the system?
 /// Assumes that cgroups v2 is enabled, i.e. cgroupsV2Enabled() returns true.
 bool cgroupsV2MemoryControllerEnabled();
 /// Detects which cgroup v2 the process belongs to and returns the filesystem path to the cgroup.
 /// Returns an empty path if the cgroup cannot be determined.
 /// Assumes that cgroups v2 is enabled, i.e. cgroupsV2Enabled() returns true.
 std::filesystem::path cgroupV2PathOfProcess();
 /// Returns the most nested cgroup dir containing the specified file.
 /// If cgroups v2 is not enabled - returns an empty optional.
 std::optional<std::string> getCgroupsV2PathContainingFile([[maybe_unused]] std::string_view file_name);
--- a/base/base/getMemoryAmount.cpp
+++ b/base/base/getMemoryAmount.cpp
@ -19,9 +19,6 @@ std::optional<uint64_t> getCgroupsV2MemoryLimit()
    if (!cgroupsV2Enabled())
        return {};
    if (!cgroupsV2MemoryControllerEnabled())
        return {};
    std::filesystem::path current_cgroup = cgroupV2PathOfProcess();
    if (current_cgroup.empty())
        return {};
--- a/src/Common/AsynchronousMetrics.cpp
+++ b/src/Common/AsynchronousMetrics.cpp
@ -1,18 +1,24 @@
 #include <Common/formatReadable.h>
 #include <Common/AsynchronousMetrics.h>
-#include <Common/Exception.h>
+
 #include <Common/setThreadName.h>
 #include <Common/CurrentMetrics.h>
 #include <Common/filesystemHelpers.h>
 #include <Common/logger_useful.h>
 #include <IO/UncompressedCache.h>
 #include <IO/MMappedFileCache.h>
 #include <IO/ReadHelpers.h>
 #include <IO/UncompressedCache.h>
 #include <base/cgroupsv2.h>
 #include <base/errnoToString.h>
 #include <base/find_symbols.h>
 #include <base/getPageSize.h>
 #include <sys/resource.h>
 #include <Common/CurrentMetrics.h>
 #include <Common/Exception.h>
 #include <Common/filesystemHelpers.h>
 #include <Common/formatReadable.h>
 #include <Common/logger_useful.h>
 #include <Common/setThreadName.h>
 #include <boost/locale/date_time_facet.hpp>
 #include <chrono>
 #include <string_view>
 #include "config.h"
@ -52,6 +58,12 @@ static std::unique_ptr<ReadBufferFromFilePRead> openFileIfExists(const std::stri
    return {};
 }
 /// Opens the cgroup v2 metric file `filename` (e.g. "memory.max", "cpu.stat") into `out`.
 /// The directory is the one reported by getCgroupsV2PathContainingFile(filename); when that returns
 /// an empty optional nothing is opened and `out` is left untouched.
 static void openCgroupv2MetricFile(const std::string & filename, std::optional<ReadBufferFromFilePRead> & out)
 {
    if (const auto cgroup_dir = getCgroupsV2PathContainingFile(filename))
    {
        /// getCgroupsV2PathContainingFile() checks for `dir / file_name`, so the full path has to be built the same way.
        /// Plain string concatenation would drop the separator and yield e.g. ".../my.slicememory.max".
        const auto metric_path = std::filesystem::path(*cgroup_dir) / filename;
        openFileIfExists(metric_path.c_str(), out);
    }
 }
 #endif
@ -63,21 +75,15 @@ AsynchronousMetrics::AsynchronousMetrics(
    , protocol_server_metrics_func(protocol_server_metrics_func_)
 {
 #if defined(OS_LINUX)
    openFileIfExists("/proc/meminfo", meminfo);
    openFileIfExists("/proc/loadavg", loadavg);
    openFileIfExists("/proc/stat", proc_stat);
    openFileIfExists("/proc/cpuinfo", cpuinfo);
    openFileIfExists("/proc/sys/fs/file-nr", file_nr);
    openFileIfExists("/proc/uptime", uptime);
    openFileIfExists("/proc/net/dev", net_dev);
    /// CGroups v2
-    openFileIfExists("/sys/fs/cgroup/memory.max", cgroupmem_limit_in_bytes);
+    openCgroupv2MetricFile("memory.max", cgroupmem_limit_in_bytes);
-    if (cgroupmem_limit_in_bytes)
+    openCgroupv2MetricFile("memory.current", cgroupmem_usage_in_bytes);
-    {
+    openCgroupv2MetricFile("cpu.max", cgroupcpu_max);
-        openFileIfExists("/sys/fs/cgroup/memory.current", cgroupmem_usage_in_bytes);
+    openCgroupv2MetricFile("cpu.stat", cgroupcpu_stat);
    }
    openFileIfExists("/sys/fs/cgroup/cpu.max", cgroupcpu_max);
    /// CGroups v1
    if (!cgroupmem_limit_in_bytes)
@ -90,6 +96,21 @@ AsynchronousMetrics::AsynchronousMetrics(
        openFileIfExists("/sys/fs/cgroup/cpu/cpu.cfs_period_us", cgroupcpu_cfs_period);
        openFileIfExists("/sys/fs/cgroup/cpu/cpu.cfs_quota_us", cgroupcpu_cfs_quota);
    }
    if (!cgroupcpu_stat)
        openFileIfExists("/sys/fs/cgroup/cpuacct/cpuacct.stat", cgroupcpuacct_stat);
    if (!cgroupcpu_stat && !cgroupcpuacct_stat)
    {
        /// The following metrics are not cgroup-aware. They are opened only here, when no cgroup-specific CPU stat file was found;
        /// if such a file exists, we are better off not reporting them at all to avoid confusion.
        openFileIfExists("/proc/loadavg", loadavg);
        openFileIfExists("/proc/stat", proc_stat);
        openFileIfExists("/proc/uptime", uptime);
    }
    /// The same story for memory metrics
    if (!cgroupmem_limit_in_bytes)
        openFileIfExists("/proc/meminfo", meminfo);
    openFileIfExists("/proc/sys/vm/max_map_count", vm_max_map_count);
    openFileIfExists("/proc/self/maps", vm_maps);
@ -570,6 +591,151 @@ AsynchronousMetrics::NetworkInterfaceStatValues::operator-(const AsynchronousMet
 #endif
 #if defined(OS_LINUX)
 /// Publishes the per-state CPU time metrics (OSUserTime, OSNiceTime, ..., OSGuestNiceTime) into `new_values`.
 /// Every metric name gets `cpu_suffix` appended, and every value is the matching field of `delta_values`
 /// scaled by `multiplier`.
 void AsynchronousMetrics::applyCPUMetricsUpdate(
    AsynchronousMetricValues & new_values, const std::string & cpu_suffix, const ProcStatValuesCPU & delta_values, double multiplier)
 {
    /// Stores a single metric: suffixed name -> {scaled delta, documentation}.
    const auto report = [&](const char * metric_name, auto raw_delta, const char * description)
    {
        new_values[metric_name + cpu_suffix] = {raw_delta * multiplier, description};
    };

    report(
        "OSUserTime",
        delta_values.user,
        "The ratio of time the CPU core was running userspace code. This is a system-wide metric, it includes all the processes on the "
        "host machine, not just clickhouse-server."
        " This includes also the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline "
        "stalls, branch mispredictions, running another SMT core)."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");
    report(
        "OSNiceTime",
        delta_values.nice,
        "The ratio of time the CPU core was running userspace code with higher priority. This is a system-wide metric, it includes all "
        "the processes on the host machine, not just clickhouse-server."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");
    report(
        "OSSystemTime",
        delta_values.system,
        "The ratio of time the CPU core was running OS kernel (system) code. This is a system-wide metric, it includes all the "
        "processes on the host machine, not just clickhouse-server."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");
    report(
        "OSIdleTime",
        delta_values.idle,
        "The ratio of time the CPU core was idle (not even ready to run a process waiting for IO) from the OS kernel standpoint. This "
        "is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
        " This does not include the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline "
        "stalls, branch mispredictions, running another SMT core)."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");
    report(
        "OSIOWaitTime",
        delta_values.iowait,
        "The ratio of time the CPU core was not running the code but when the OS kernel did not run any other process on this CPU as "
        "the processes were waiting for IO. This is a system-wide metric, it includes all the processes on the host machine, not just "
        "clickhouse-server."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");
    report(
        "OSIrqTime",
        delta_values.irq,
        "The ratio of time spent for running hardware interrupt requests on the CPU. This is a system-wide metric, it includes all the "
        "processes on the host machine, not just clickhouse-server."
        " A high number of this metric may indicate hardware misconfiguration or a very high network load."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");
    report(
        "OSSoftIrqTime",
        delta_values.softirq,
        "The ratio of time spent for running software interrupt requests on the CPU. This is a system-wide metric, it includes all the "
        "processes on the host machine, not just clickhouse-server."
        " A high number of this metric may indicate inefficient software running on the system."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");
    report(
        "OSStealTime",
        delta_values.steal,
        "The ratio of time spent in other operating systems by the CPU when running in a virtualized environment. This is a system-wide "
        "metric, it includes all the processes on the host machine, not just clickhouse-server."
        " Not every virtualized environments present this metric, and most of them don't."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");
    report(
        "OSGuestTime",
        delta_values.guest,
        "The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel (See `man "
        "procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
        " This metric is irrelevant for ClickHouse, but still exists for completeness."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");
    report(
        "OSGuestNiceTime",
        delta_values.guest_nice,
        "The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel, when a guest "
        "was set to a higher priority (See `man procfs`). This is a system-wide metric, it includes all the processes on the host "
        "machine, not just clickhouse-server."
        " This metric is irrelevant for ClickHouse, but still exists for completeness."
        " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
        "them [0..num cores].");
 }
 /// Publishes the normalized CPU time metrics (OSUserTimeNormalized, ..., OSGuestNiceTimeNormalized) into `new_values`.
 /// Every value is the matching field of `delta_values_all_cpus` scaled by `multiplier` and then divided by
 /// `num_cpus_to_normalize`, which must be non-zero (checked with chassert).
 void AsynchronousMetrics::applyNormalizedCPUMetricsUpdate(
    AsynchronousMetricValues & new_values, double num_cpus_to_normalize, const ProcStatValuesCPU & delta_values_all_cpus, double multiplier)
 {
    chassert(num_cpus_to_normalize);

    /// Stores a single metric: name -> {scaled and normalized delta, documentation}.
    const auto report = [&](const char * metric_name, auto raw_delta, const char * description)
    {
        new_values[metric_name] = {raw_delta * multiplier / num_cpus_to_normalize, description};
    };

    report(
        "OSUserTimeNormalized",
        delta_values_all_cpus.user,
        "The value is similar to `OSUserTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");
    report(
        "OSNiceTimeNormalized",
        delta_values_all_cpus.nice,
        "The value is similar to `OSNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");
    report(
        "OSSystemTimeNormalized",
        delta_values_all_cpus.system,
        "The value is similar to `OSSystemTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");
    report(
        "OSIdleTimeNormalized",
        delta_values_all_cpus.idle,
        "The value is similar to `OSIdleTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");
    report(
        "OSIOWaitTimeNormalized",
        delta_values_all_cpus.iowait,
        "The value is similar to `OSIOWaitTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");
    report(
        "OSIrqTimeNormalized",
        delta_values_all_cpus.irq,
        "The value is similar to `OSIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of "
        "the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");
    report(
        "OSSoftIrqTimeNormalized",
        delta_values_all_cpus.softirq,
        "The value is similar to `OSSoftIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval "
        "regardless of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");
    report(
        "OSStealTimeNormalized",
        delta_values_all_cpus.steal,
        "The value is similar to `OSStealTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");
    report(
        "OSGuestTimeNormalized",
        delta_values_all_cpus.guest,
        "The value is similar to `OSGuestTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
        "of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");
    report(
        "OSGuestNiceTimeNormalized",
        delta_values_all_cpus.guest_nice,
        "The value is similar to `OSGuestNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval "
        "regardless of the number of cores."
        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
        "non-uniform, and still get the average resource utilization metric.");
 }
 #endif
 void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
 {
    Stopwatch watch;
@ -831,7 +997,68 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
        new_values["CGroupMaxCPU"] = { max_cpu_cgroups, "The maximum number of CPU cores according to CGroups."};
    }
-    if (proc_stat)
+    if (cgroupcpu_stat || cgroupcpuacct_stat)
    {
        try
        {
            ReadBufferFromFilePRead & in = cgroupcpu_stat ? *cgroupcpu_stat : *cgroupcpuacct_stat;
            ProcStatValuesCPU current_values{};
            /// We re-read the file from the beginning each time
            in.rewind();
            while (!in.eof())
            {
                String name;
                readStringUntilWhitespace(name, in);
                skipWhitespaceIfAny(in);
                /// `user_usec` for cgroup v2 and `user` for cgroup v1
                if (name.starts_with("user"))
                {
                    readText(current_values.user, in);
                    skipToNextLineOrEOF(in);
                }
                /// `system_usec` for cgroup v2 and `system` for cgroup v1
                else if (name.starts_with("system"))
                {
                    readText(current_values.system, in);
                    skipToNextLineOrEOF(in);
                }
                else
                    skipToNextLineOrEOF(in);
            }
            if (!first_run)
            {
                auto get_clock_ticks = [&]()
                {
                    if (auto hz = sysconf(_SC_CLK_TCK); hz != -1)
                        return hz;
                    else
                        throw ErrnoException(ErrorCodes::CANNOT_SYSCONF, "Cannot call 'sysconf' to obtain system HZ");
                };
                const auto cgroup_version_specific_divisor = cgroupcpu_stat ? 1e6 : get_clock_ticks();
                const double multiplier = 1.0 / cgroup_version_specific_divisor
                    / (std::chrono::duration_cast<std::chrono::nanoseconds>(time_since_previous_update).count() / 1e9);
                const ProcStatValuesCPU delta_values = current_values - proc_stat_values_all_cpus;
                applyCPUMetricsUpdate(new_values, /*cpu_suffix=*/"", delta_values, multiplier);
                if (max_cpu_cgroups > 0)
                    applyNormalizedCPUMetricsUpdate(new_values, max_cpu_cgroups, delta_values, multiplier);
            }
            proc_stat_values_all_cpus = current_values;
        }
        catch (...)
        {
            tryLogCurrentException(__PRETTY_FUNCTION__);
            openCgroupv2MetricFile("cpu.stat", cgroupcpu_stat);
            if (!cgroupcpu_stat)
                openFileIfExists("/sys/fs/cgroup/cpuacct/cpuacct.stat", cgroupcpuacct_stat);
        }
    }
    else if (proc_stat)
    {
        try
        {
@ -886,43 +1113,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
                        else
                            delta_values_all_cpus = delta_values;
-                        new_values["OSUserTime" + cpu_suffix] = { delta_values.user * multiplier,
+                        applyCPUMetricsUpdate(new_values, cpu_suffix, delta_values, multiplier);
                            "The ratio of time the CPU core was running userspace code. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
                            " This includes also the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline stalls, branch mispredictions, running another SMT core)."
                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
                        new_values["OSNiceTime" + cpu_suffix] = { delta_values.nice * multiplier,
                            "The ratio of time the CPU core was running userspace code with higher priority. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
                        new_values["OSSystemTime" + cpu_suffix] = { delta_values.system * multiplier,
                            "The ratio of time the CPU core was running OS kernel (system) code. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
                        new_values["OSIdleTime" + cpu_suffix] = { delta_values.idle * multiplier,
                            "The ratio of time the CPU core was idle (not even ready to run a process waiting for IO) from the OS kernel standpoint. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
                            " This does not include the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline stalls, branch mispredictions, running another SMT core)."
                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
                        new_values["OSIOWaitTime" + cpu_suffix] = { delta_values.iowait * multiplier,
                            "The ratio of time the CPU core was not running the code but when the OS kernel did not run any other process on this CPU as the processes were waiting for IO. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
                        new_values["OSIrqTime" + cpu_suffix] = { delta_values.irq * multiplier,
                            "The ratio of time spent for running hardware interrupt requests on the CPU. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
                            " A high number of this metric may indicate hardware misconfiguration or a very high network load."
                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
                        new_values["OSSoftIrqTime" + cpu_suffix] = { delta_values.softirq * multiplier,
                            "The ratio of time spent for running software interrupt requests on the CPU. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
                            " A high number of this metric may indicate inefficient software running on the system."
                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
                        new_values["OSStealTime" + cpu_suffix] = { delta_values.steal * multiplier,
                            "The ratio of time spent in other operating systems by the CPU when running in a virtualized environment. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
                            " Not every virtualized environments present this metric, and most of them don't."
                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
                        new_values["OSGuestTime" + cpu_suffix] = { delta_values.guest * multiplier,
                            "The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel (See `man procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
                            " This metric is irrelevant for ClickHouse, but still exists for completeness."
                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
                        new_values["OSGuestNiceTime" + cpu_suffix] = { delta_values.guest_nice * multiplier,
                            "The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel, when a guest was set to a higher priority (See `man procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
                            " This metric is irrelevant for ClickHouse, but still exists for completeness."
                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
                    }
                    prev_values = current_values;
@ -978,38 +1169,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
                Float64 num_cpus_to_normalize = max_cpu_cgroups > 0 ? max_cpu_cgroups : num_cpus;
                if (num_cpus_to_normalize > 0)
-                {
+                    applyNormalizedCPUMetricsUpdate(new_values, num_cpus_to_normalize, delta_values_all_cpus, multiplier);
                    new_values["OSUserTimeNormalized"] = { delta_values_all_cpus.user * multiplier / num_cpus_to_normalize,
                        "The value is similar to `OSUserTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
                    new_values["OSNiceTimeNormalized"] = { delta_values_all_cpus.nice * multiplier / num_cpus_to_normalize,
                        "The value is similar to `OSNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
                    new_values["OSSystemTimeNormalized"] = { delta_values_all_cpus.system * multiplier / num_cpus_to_normalize,
                        "The value is similar to `OSSystemTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
                    new_values["OSIdleTimeNormalized"] = { delta_values_all_cpus.idle * multiplier / num_cpus_to_normalize,
                        "The value is similar to `OSIdleTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
                    new_values["OSIOWaitTimeNormalized"] = { delta_values_all_cpus.iowait * multiplier / num_cpus_to_normalize,
                        "The value is similar to `OSIOWaitTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
                    new_values["OSIrqTimeNormalized"] = { delta_values_all_cpus.irq * multiplier / num_cpus_to_normalize,
                        "The value is similar to `OSIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
                    new_values["OSSoftIrqTimeNormalized"] = { delta_values_all_cpus.softirq * multiplier / num_cpus_to_normalize,
                        "The value is similar to `OSSoftIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
                    new_values["OSStealTimeNormalized"] = { delta_values_all_cpus.steal * multiplier / num_cpus_to_normalize,
                        "The value is similar to `OSStealTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
                    new_values["OSGuestTimeNormalized"] = { delta_values_all_cpus.guest * multiplier / num_cpus_to_normalize,
                        "The value is similar to `OSGuestTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
                    new_values["OSGuestNiceTimeNormalized"] = { delta_values_all_cpus.guest_nice * multiplier / num_cpus_to_normalize,
                        "The value is similar to `OSGuestNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
                }
            }
            proc_stat_values_other = current_other_values;
@ -1042,8 +1202,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
            tryLogCurrentException(__PRETTY_FUNCTION__);
        }
    }
-
+    else if (meminfo)
    if (meminfo)
    {
        try
        {
--- a/src/Common/AsynchronousMetrics.h
+++ b/src/Common/AsynchronousMetrics.h
@ -126,6 +126,8 @@ private:
    std::optional<ReadBufferFromFilePRead> cgroupcpu_cfs_period TSA_GUARDED_BY(data_mutex);
    std::optional<ReadBufferFromFilePRead> cgroupcpu_cfs_quota TSA_GUARDED_BY(data_mutex);
    std::optional<ReadBufferFromFilePRead> cgroupcpu_max TSA_GUARDED_BY(data_mutex);
    std::optional<ReadBufferFromFilePRead> cgroupcpu_stat TSA_GUARDED_BY(data_mutex);
    std::optional<ReadBufferFromFilePRead> cgroupcpuacct_stat TSA_GUARDED_BY(data_mutex);
    std::optional<ReadBufferFromFilePRead> vm_max_map_count TSA_GUARDED_BY(data_mutex);
    std::optional<ReadBufferFromFilePRead> vm_maps TSA_GUARDED_BY(data_mutex);
@ -221,6 +223,16 @@ private:
    void openBlockDevices();
    void openSensorsChips();
    void openEDAC();
    /// NOTE(review): the definition is not visible from this header. Presumably stores the CPU time metrics
    /// computed from `delta_values` into `new_values`, using `cpu_suffix` to distinguish metric names and
    /// `multiplier` to scale the raw deltas - confirm against the .cpp before relying on this.
    void applyCPUMetricsUpdate(
        AsynchronousMetricValues & new_values, const std::string & cpu_suffix, const ProcStatValuesCPU & delta_values, double multiplier);
    /// NOTE(review): the definition is not visible from this header. Presumably fills the `OS*TimeNormalized`
    /// metrics in `new_values`, each being `delta_values_all_cpus.<field> * multiplier / num_cpus_to_normalize`
    /// - confirm against the .cpp before relying on this.
    void applyNormalizedCPUMetricsUpdate(
        AsynchronousMetricValues & new_values,
        double num_cpus_to_normalize,
        const ProcStatValuesCPU & delta_values_all_cpus,
        double multiplier);
 #endif
    void run();
--- a/src/Common/CgroupsMemoryUsageObserver.cpp
+++ b/src/Common/CgroupsMemoryUsageObserver.cpp
@ -144,31 +144,6 @@ private:
 /// - I did not test what happens if a host has v1 and v2 simultaneously enabled. I believe such
 ///   systems existed only for a short transition period.
 std::optional<std::string> getCgroupsV2Path()
 {
    if (!cgroupsV2Enabled())
        return {};
    if (!cgroupsV2MemoryControllerEnabled())
        return {};
    fs::path current_cgroup = cgroupV2PathOfProcess();
    if (current_cgroup.empty())
        return {};
    /// Return the bottom-most nested current memory file. If there is no such file at the current
    /// level, try again at the parent level as memory settings are inherited.
    while (current_cgroup != default_cgroups_mount.parent_path())
    {
        const auto current_path = current_cgroup / "memory.current";
        const auto stat_path = current_cgroup / "memory.stat";
        if (fs::exists(current_path) && fs::exists(stat_path))
            return {current_cgroup};
        current_cgroup = current_cgroup.parent_path();
    }
    return {};
 }
 std::optional<std::string> getCgroupsV1Path()
 {
    auto path = default_cgroups_mount / "memory/memory.stat";
@ -179,7 +154,7 @@ std::optional<std::string> getCgroupsV1Path()
 std::pair<std::string, CgroupsMemoryUsageObserver::CgroupsVersion> getCgroupsPath()
 {
-    auto v2_path = getCgroupsV2Path();
+    auto v2_path = getCgroupsV2PathContainingFile("memory.current");
    if (v2_path.has_value())
        return {*v2_path, CgroupsMemoryUsageObserver::CgroupsVersion::V2};
--- a/tests/integration/test_async_metrics_in_cgroup/init.py
+++ b/tests/integration/test_async_metrics_in_cgroup/init.py
--- a/tests/integration/test_async_metrics_in_cgroup/test.py
+++ b/tests/integration/test_async_metrics_in_cgroup/test.py
@ -0,0 +1,69 @@
 import pytest
 from helpers.cluster import ClickHouseCluster
 # Two separate server instances: the tests below put load on node2 and check
 # that node1's own CPU metrics are not affected by it.
 cluster = ClickHouseCluster(__file__)
 node1 = cluster.add_instance("node1", stay_alive=True)
 node2 = cluster.add_instance("node2", stay_alive=True)
@pytest.fixture(scope="module")
 def start_cluster():
    """Module-scoped fixture: start the cluster, hand it to the tests, and shut it down afterwards."""
    try:
        cluster.start()
        yield cluster
    finally:
        # Reached on normal teardown and also when cluster.start() raises.
        cluster.shutdown()
 def run_cpu_intensive_task(node):
    """Load `node` with a summation over system.numbers_mt, capped by max_execution_time=10.

    The query is issued with ignore_error=True and its result is discarded.
    """
    heavy_query = "SELECT sum(*) FROM system.numbers_mt FORMAT Null SETTINGS max_execution_time=10"
    node.query(heavy_query, ignore_error=True)
 def get_async_metric(node, metric):
    """Return the peak per-second average of `metric` over the last 60 seconds.

    Logs are flushed first, then system.asynchronous_metric_log is queried on `node`.
    The result is the raw query output (a string) with leading/trailing newlines removed.
    """
    node.query("SYSTEM FLUSH LOGS")
    peak_query = f"""
        SELECT max(value)
            FROM (
            SELECT toStartOfInterval(event_time, toIntervalSecond(1)) AS t, avg(value) AS value
                FROM system.asynchronous_metric_log
            WHERE event_time >= now() - 60 AND metric = '{metric}'
            GROUP BY t
            )
        SETTINGS max_threads = 1
        """
    raw_result = node.query(peak_query)
    return raw_result.strip("\n")
 def test_user_cpu_accounting(start_cluster):
    """OSUserTime of a node must reflect the CPU used by that node's server only."""
    if node1.is_built_with_sanitizer():
        pytest.skip("Disabled for sanitizers")

    # Only node2 gets loaded; node1 must not see that usage in its own metric.
    run_cpu_intensive_task(node2)
    idle_node_user_time = get_async_metric(node1, "OSUserTime")
    assert float(idle_node_user_time) < 2

    # The loaded node, in turn, has to account for the CPU time spent by its own server.
    busy_node_user_time = get_async_metric(node2, "OSUserTime")
    # Deliberately weak bound: on CI there is no guarantee that the test process gets much CPU time.
    assert float(busy_node_user_time) > 2
 def test_normalized_user_cpu(start_cluster):
    """OSUserTimeNormalized must stay within the [0..1] range (small tolerance) on both nodes."""
    if node1.is_built_with_sanitizer():
        pytest.skip("Disabled for sanitizers")

    # Only node2 gets loaded; node1 must not see that usage in its own metric.
    run_cpu_intensive_task(node2)

    idle_node_normalized = get_async_metric(node1, "OSUserTimeNormalized")
    assert float(idle_node_normalized) < 1.01

    busy_node_normalized = get_async_metric(node2, "OSUserTimeNormalized")
    assert float(busy_node_normalized) < 1.01