Merge pull request #59435 from rschu1ze/nested-memory-cgroups

Consider nested cgroup memory limits for the total memory tracker
This commit is contained in:
robot-ch-test-poll4 2024-02-06 20:18:00 +01:00 committed by GitHub
commit 3af96c6970
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 113 additions and 43 deletions

View File

@ -1,8 +1,11 @@
#include <stdexcept>
#include <fstream>
#include <base/getMemoryAmount.h>
#include <base/getPageSize.h>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <unistd.h>
#include <sys/types.h>
#include <sys/param.h>
@ -11,6 +14,80 @@
#endif
namespace
{
std::optional<uint64_t> getCgroupsV2MemoryLimit()
{
#if defined(OS_LINUX)
const std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup";
/// This file exists iff the host has cgroups v2 enabled.
std::ifstream controllers_file(default_cgroups_mount / "cgroup.controllers");
if (!controllers_file.is_open())
return {};
/// Make sure that the memory controller is enabled.
/// - cgroup.controllers defines which controllers *can* be enabled.
/// - cgroup.subtree_control defines which controllers *are* enabled.
/// (see https://docs.kernel.org/admin-guide/cgroup-v2.html)
/// Caveat: nested groups may disable controllers. For simplicity, check only the top-level group.
/// ReadBufferFromFile subtree_control_file(default_cgroups_mount / "cgroup.subtree_control");
/// std::string subtree_control;
/// readString(subtree_control, subtree_control_file);
/// if (subtree_control.find("memory") == std::string::npos)
/// return {};
std::ifstream subtree_control_file(default_cgroups_mount / "cgroup.subtree_control");
std::stringstream subtree_control_buf;
subtree_control_buf << subtree_control_file.rdbuf();
std::string subtree_control = subtree_control_buf.str();
if (subtree_control.find("memory") == std::string::npos)
return {};
/// Identify the cgroup the process belongs to
/// All PIDs assigned to a cgroup are in /sys/fs/cgroups/{cgroup_name}/cgroup.procs
/// A simpler way to get the membership is:
std::ifstream cgroup_name_file("/proc/self/cgroup");
if (!cgroup_name_file.is_open())
return {};
std::stringstream cgroup_name_buf;
cgroup_name_buf << cgroup_name_file.rdbuf();
std::string cgroup_name = cgroup_name_buf.str();
if (!cgroup_name.empty() && cgroup_name.back() == '\n')
cgroup_name.pop_back(); /// remove trailing newline, if any
/// With cgroups v2, there will be a *single* line with prefix "0::/"
const std::string v2_prefix = "0::/";
if (!cgroup_name.starts_with(v2_prefix))
return {};
cgroup_name = cgroup_name.substr(v2_prefix.length());
std::filesystem::path current_cgroup = cgroup_name.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup_name);
/// Open the bottom-most nested memory limit setting file. If there is no such file at the current
/// level, try again at the parent level as memory settings are inherited.
while (current_cgroup != default_cgroups_mount.parent_path())
{
std::ifstream setting_file(current_cgroup / "memory.max");
if (setting_file.is_open())
{
uint64_t value;
if (setting_file >> value)
return {value};
else
return {}; /// e.g. the cgroups default "max"
}
current_cgroup = current_cgroup.parent_path();
}
return {};
#else
return {};
#endif
}
}
/** Returns the size of physical memory (RAM) in bytes.
* Returns 0 on unsupported platform
*/
@ -26,34 +103,27 @@ uint64_t getMemoryAmountOrZero()
uint64_t memory_amount = num_pages * page_size;
#if defined(OS_LINUX)
// Try to lookup at the Cgroup limit
// CGroups v2
std::ifstream cgroupv2_limit("/sys/fs/cgroup/memory.max");
if (cgroupv2_limit.is_open())
{
uint64_t memory_limit = 0;
cgroupv2_limit >> memory_limit;
if (memory_limit > 0 && memory_limit < memory_amount)
memory_amount = memory_limit;
}
/// Respect the memory limit set by cgroups v2.
auto limit_v2 = getCgroupsV2MemoryLimit();
if (limit_v2.has_value() && *limit_v2 < memory_amount)
memory_amount = *limit_v2;
else
{
// CGroups v1
std::ifstream cgroup_limit("/sys/fs/cgroup/memory/memory.limit_in_bytes");
if (cgroup_limit.is_open())
/// Cgroups v1 were replaced by v2 in 2015. The only reason we keep supporting v1 is that the transition to v2
/// has been slow. Caveat : Hierarchical groups as in v2 are not supported for v1, the location of the memory
/// limit (virtual) file is hard-coded.
/// TODO: check at the end of 2024 if we can get rid of v1.
std::ifstream limit_file_v1("/sys/fs/cgroup/memory/memory.limit_in_bytes");
if (limit_file_v1.is_open())
{
uint64_t memory_limit = 0; // in case of read error
cgroup_limit >> memory_limit;
if (memory_limit > 0 && memory_limit < memory_amount)
memory_amount = memory_limit;
uint64_t limit_v1;
if (limit_file_v1 >> limit_v1)
if (limit_v1 < memory_amount)
memory_amount = limit_v1;
}
}
#endif
return memory_amount;
}

View File

@ -1,7 +1,6 @@
#include "getNumberOfPhysicalCPUCores.h"
#include <filesystem>
#include "config.h"
#if defined(OS_LINUX)
# include <cmath>
# include <fstream>
@ -34,9 +33,9 @@ int32_t readFrom(const std::filesystem::path & filename, int default_value)
uint32_t getCGroupLimitedCPUCores(unsigned default_cpu_count)
{
uint32_t quota_count = default_cpu_count;
std::filesystem::path prefix = "/sys/fs/cgroup";
std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup";
/// cgroupsv2
std::ifstream contr_file(prefix / "cgroup.controllers");
std::ifstream contr_file(default_cgroups_mount / "cgroup.controllers");
if (contr_file.is_open())
{
/// First, we identify the cgroup the process belongs
@ -51,16 +50,15 @@ uint32_t getCGroupLimitedCPUCores(unsigned default_cpu_count)
std::filesystem::path current_cgroup;
if (cgroup_name.empty())
current_cgroup = prefix;
current_cgroup = default_cgroups_mount;
else
current_cgroup = prefix / cgroup_name;
current_cgroup = default_cgroups_mount / cgroup_name;
// Looking for cpu.max in directories from the current cgroup to the top level
// It does not stop on the first time since the child could have a greater value than parent
while (current_cgroup != prefix.parent_path())
while (current_cgroup != default_cgroups_mount.parent_path())
{
std::ifstream cpu_max_file(current_cgroup / "cpu.max");
current_cgroup = current_cgroup.parent_path();
if (cpu_max_file.is_open())
{
std::string cpu_limit_str;
@ -72,10 +70,11 @@ uint32_t getCGroupLimitedCPUCores(unsigned default_cpu_count)
quota_count = std::min(static_cast<uint32_t>(ceil(cpu_limit / cpu_period)), quota_count);
}
}
current_cgroup = current_cgroup.parent_path();
}
current_cgroup = prefix / cgroup_name;
current_cgroup = default_cgroups_mount / cgroup_name;
// Looking for cpuset.cpus.effective in directories from the current cgroup to the top level
while (current_cgroup != prefix.parent_path())
while (current_cgroup != default_cgroups_mount.parent_path())
{
std::ifstream cpuset_cpus_file(current_cgroup / "cpuset.cpus.effective");
current_cgroup = current_cgroup.parent_path();
@ -113,8 +112,8 @@ uint32_t getCGroupLimitedCPUCores(unsigned default_cpu_count)
/// cgroupsv1
/// Return the number of milliseconds per period process is guaranteed to run.
/// -1 for no quota
int cgroup_quota = readFrom(prefix / "cpu/cpu.cfs_quota_us", -1);
int cgroup_period = readFrom(prefix / "cpu/cpu.cfs_period_us", -1);
int cgroup_quota = readFrom(default_cgroups_mount / "cpu/cpu.cfs_quota_us", -1);
int cgroup_period = readFrom(default_cgroups_mount / "cpu/cpu.cfs_period_us", -1);
if (cgroup_quota > -1 && cgroup_period > 0)
quota_count = static_cast<uint32_t>(ceil(static_cast<float>(cgroup_quota) / static_cast<float>(cgroup_period)));
@ -178,24 +177,25 @@ catch (...)
unsigned getNumberOfPhysicalCPUCoresImpl()
{
unsigned cpu_count = std::thread::hardware_concurrency(); /// logical cores (with SMT/HyperThreading)
unsigned cores = std::thread::hardware_concurrency(); /// logical cores (with SMT/HyperThreading)
#if defined(__x86_64__) && defined(OS_LINUX)
/// Most x86_64 CPUs have 2-way SMT (Hyper-Threading).
/// Aarch64 and RISC-V don't have SMT so far.
/// POWER has SMT and it can be multi-way (e.g. 8-way), but we don't know how ClickHouse really behaves, so use all of them.
#if defined(__x86_64__) && defined(OS_LINUX)
///
/// On really big machines, SMT is detrimental to performance (+ ~5% overhead in ClickBench). On such machines, we limit ourself to the physical cores.
/// Few cores indicate it is a small machine, runs in a VM or is a limited cloud instance --> it is reasonable to use all the cores.
if (cpu_count >= 32)
cpu_count = physical_concurrency();
if (cores >= 32)
cores = physical_concurrency();
#endif
#if defined(OS_LINUX)
cpu_count = getCGroupLimitedCPUCores(cpu_count);
cores = getCGroupLimitedCPUCores(cores);
#endif
return cpu_count;
return cores;
}
}
@ -203,6 +203,6 @@ unsigned getNumberOfPhysicalCPUCoresImpl()
unsigned getNumberOfPhysicalCPUCores()
{
/// Calculate once.
static auto res = getNumberOfPhysicalCPUCoresImpl();
return res;
static auto cores = getNumberOfPhysicalCPUCoresImpl();
return cores;
}