mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 16:42:05 +00:00
Merge pull request #59435 from rschu1ze/nested-memory-cgroups
Consider nested cgroup memory limits for the total memory tracker
This commit is contained in:
commit
3af96c6970
@ -1,8 +1,11 @@
|
||||
#include <stdexcept>
|
||||
#include <fstream>
|
||||
#include <base/getMemoryAmount.h>
|
||||
|
||||
#include <base/getPageSize.h>
|
||||
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/param.h>
|
||||
@ -11,6 +14,80 @@
|
||||
#endif
|
||||
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
std::optional<uint64_t> getCgroupsV2MemoryLimit()
|
||||
{
|
||||
#if defined(OS_LINUX)
|
||||
const std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup";
|
||||
|
||||
/// This file exists iff the host has cgroups v2 enabled.
|
||||
std::ifstream controllers_file(default_cgroups_mount / "cgroup.controllers");
|
||||
if (!controllers_file.is_open())
|
||||
return {};
|
||||
|
||||
/// Make sure that the memory controller is enabled.
|
||||
/// - cgroup.controllers defines which controllers *can* be enabled.
|
||||
/// - cgroup.subtree_control defines which controllers *are* enabled.
|
||||
/// (see https://docs.kernel.org/admin-guide/cgroup-v2.html)
|
||||
/// Caveat: nested groups may disable controllers. For simplicity, check only the top-level group.
|
||||
/// ReadBufferFromFile subtree_control_file(default_cgroups_mount / "cgroup.subtree_control");
|
||||
/// std::string subtree_control;
|
||||
/// readString(subtree_control, subtree_control_file);
|
||||
/// if (subtree_control.find("memory") == std::string::npos)
|
||||
/// return {};
|
||||
std::ifstream subtree_control_file(default_cgroups_mount / "cgroup.subtree_control");
|
||||
std::stringstream subtree_control_buf;
|
||||
subtree_control_buf << subtree_control_file.rdbuf();
|
||||
std::string subtree_control = subtree_control_buf.str();
|
||||
if (subtree_control.find("memory") == std::string::npos)
|
||||
return {};
|
||||
|
||||
/// Identify the cgroup the process belongs to
|
||||
/// All PIDs assigned to a cgroup are in /sys/fs/cgroups/{cgroup_name}/cgroup.procs
|
||||
/// A simpler way to get the membership is:
|
||||
std::ifstream cgroup_name_file("/proc/self/cgroup");
|
||||
if (!cgroup_name_file.is_open())
|
||||
return {};
|
||||
|
||||
std::stringstream cgroup_name_buf;
|
||||
cgroup_name_buf << cgroup_name_file.rdbuf();
|
||||
std::string cgroup_name = cgroup_name_buf.str();
|
||||
if (!cgroup_name.empty() && cgroup_name.back() == '\n')
|
||||
cgroup_name.pop_back(); /// remove trailing newline, if any
|
||||
/// With cgroups v2, there will be a *single* line with prefix "0::/"
|
||||
const std::string v2_prefix = "0::/";
|
||||
if (!cgroup_name.starts_with(v2_prefix))
|
||||
return {};
|
||||
cgroup_name = cgroup_name.substr(v2_prefix.length());
|
||||
|
||||
std::filesystem::path current_cgroup = cgroup_name.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup_name);
|
||||
|
||||
/// Open the bottom-most nested memory limit setting file. If there is no such file at the current
|
||||
/// level, try again at the parent level as memory settings are inherited.
|
||||
while (current_cgroup != default_cgroups_mount.parent_path())
|
||||
{
|
||||
std::ifstream setting_file(current_cgroup / "memory.max");
|
||||
if (setting_file.is_open())
|
||||
{
|
||||
uint64_t value;
|
||||
if (setting_file >> value)
|
||||
return {value};
|
||||
else
|
||||
return {}; /// e.g. the cgroups default "max"
|
||||
}
|
||||
current_cgroup = current_cgroup.parent_path();
|
||||
}
|
||||
|
||||
return {};
|
||||
#else
|
||||
return {};
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** Returns the size of physical memory (RAM) in bytes.
|
||||
* Returns 0 on unsupported platform
|
||||
*/
|
||||
@ -26,34 +103,27 @@ uint64_t getMemoryAmountOrZero()
|
||||
|
||||
uint64_t memory_amount = num_pages * page_size;
|
||||
|
||||
#if defined(OS_LINUX)
|
||||
// Try to lookup at the Cgroup limit
|
||||
|
||||
// CGroups v2
|
||||
std::ifstream cgroupv2_limit("/sys/fs/cgroup/memory.max");
|
||||
if (cgroupv2_limit.is_open())
|
||||
{
|
||||
uint64_t memory_limit = 0;
|
||||
cgroupv2_limit >> memory_limit;
|
||||
if (memory_limit > 0 && memory_limit < memory_amount)
|
||||
memory_amount = memory_limit;
|
||||
}
|
||||
/// Respect the memory limit set by cgroups v2.
|
||||
auto limit_v2 = getCgroupsV2MemoryLimit();
|
||||
if (limit_v2.has_value() && *limit_v2 < memory_amount)
|
||||
memory_amount = *limit_v2;
|
||||
else
|
||||
{
|
||||
// CGroups v1
|
||||
std::ifstream cgroup_limit("/sys/fs/cgroup/memory/memory.limit_in_bytes");
|
||||
if (cgroup_limit.is_open())
|
||||
/// Cgroups v1 were replaced by v2 in 2015. The only reason we keep supporting v1 is that the transition to v2
|
||||
/// has been slow. Caveat : Hierarchical groups as in v2 are not supported for v1, the location of the memory
|
||||
/// limit (virtual) file is hard-coded.
|
||||
/// TODO: check at the end of 2024 if we can get rid of v1.
|
||||
std::ifstream limit_file_v1("/sys/fs/cgroup/memory/memory.limit_in_bytes");
|
||||
if (limit_file_v1.is_open())
|
||||
{
|
||||
uint64_t memory_limit = 0; // in case of read error
|
||||
cgroup_limit >> memory_limit;
|
||||
if (memory_limit > 0 && memory_limit < memory_amount)
|
||||
memory_amount = memory_limit;
|
||||
uint64_t limit_v1;
|
||||
if (limit_file_v1 >> limit_v1)
|
||||
if (limit_v1 < memory_amount)
|
||||
memory_amount = limit_v1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return memory_amount;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
#include "getNumberOfPhysicalCPUCores.h"
|
||||
#include <filesystem>
|
||||
|
||||
#include "config.h"
|
||||
#if defined(OS_LINUX)
|
||||
# include <cmath>
|
||||
# include <fstream>
|
||||
@ -34,9 +33,9 @@ int32_t readFrom(const std::filesystem::path & filename, int default_value)
|
||||
uint32_t getCGroupLimitedCPUCores(unsigned default_cpu_count)
|
||||
{
|
||||
uint32_t quota_count = default_cpu_count;
|
||||
std::filesystem::path prefix = "/sys/fs/cgroup";
|
||||
std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup";
|
||||
/// cgroupsv2
|
||||
std::ifstream contr_file(prefix / "cgroup.controllers");
|
||||
std::ifstream contr_file(default_cgroups_mount / "cgroup.controllers");
|
||||
if (contr_file.is_open())
|
||||
{
|
||||
/// First, we identify the cgroup the process belongs
|
||||
@ -51,16 +50,15 @@ uint32_t getCGroupLimitedCPUCores(unsigned default_cpu_count)
|
||||
|
||||
std::filesystem::path current_cgroup;
|
||||
if (cgroup_name.empty())
|
||||
current_cgroup = prefix;
|
||||
current_cgroup = default_cgroups_mount;
|
||||
else
|
||||
current_cgroup = prefix / cgroup_name;
|
||||
current_cgroup = default_cgroups_mount / cgroup_name;
|
||||
|
||||
// Looking for cpu.max in directories from the current cgroup to the top level
|
||||
// It does not stop on the first time since the child could have a greater value than parent
|
||||
while (current_cgroup != prefix.parent_path())
|
||||
while (current_cgroup != default_cgroups_mount.parent_path())
|
||||
{
|
||||
std::ifstream cpu_max_file(current_cgroup / "cpu.max");
|
||||
current_cgroup = current_cgroup.parent_path();
|
||||
if (cpu_max_file.is_open())
|
||||
{
|
||||
std::string cpu_limit_str;
|
||||
@ -72,10 +70,11 @@ uint32_t getCGroupLimitedCPUCores(unsigned default_cpu_count)
|
||||
quota_count = std::min(static_cast<uint32_t>(ceil(cpu_limit / cpu_period)), quota_count);
|
||||
}
|
||||
}
|
||||
current_cgroup = current_cgroup.parent_path();
|
||||
}
|
||||
current_cgroup = prefix / cgroup_name;
|
||||
current_cgroup = default_cgroups_mount / cgroup_name;
|
||||
// Looking for cpuset.cpus.effective in directories from the current cgroup to the top level
|
||||
while (current_cgroup != prefix.parent_path())
|
||||
while (current_cgroup != default_cgroups_mount.parent_path())
|
||||
{
|
||||
std::ifstream cpuset_cpus_file(current_cgroup / "cpuset.cpus.effective");
|
||||
current_cgroup = current_cgroup.parent_path();
|
||||
@ -113,8 +112,8 @@ uint32_t getCGroupLimitedCPUCores(unsigned default_cpu_count)
|
||||
/// cgroupsv1
|
||||
/// Return the number of milliseconds per period process is guaranteed to run.
|
||||
/// -1 for no quota
|
||||
int cgroup_quota = readFrom(prefix / "cpu/cpu.cfs_quota_us", -1);
|
||||
int cgroup_period = readFrom(prefix / "cpu/cpu.cfs_period_us", -1);
|
||||
int cgroup_quota = readFrom(default_cgroups_mount / "cpu/cpu.cfs_quota_us", -1);
|
||||
int cgroup_period = readFrom(default_cgroups_mount / "cpu/cpu.cfs_period_us", -1);
|
||||
if (cgroup_quota > -1 && cgroup_period > 0)
|
||||
quota_count = static_cast<uint32_t>(ceil(static_cast<float>(cgroup_quota) / static_cast<float>(cgroup_period)));
|
||||
|
||||
@ -178,24 +177,25 @@ catch (...)
|
||||
|
||||
unsigned getNumberOfPhysicalCPUCoresImpl()
|
||||
{
|
||||
unsigned cpu_count = std::thread::hardware_concurrency(); /// logical cores (with SMT/HyperThreading)
|
||||
unsigned cores = std::thread::hardware_concurrency(); /// logical cores (with SMT/HyperThreading)
|
||||
|
||||
|
||||
#if defined(__x86_64__) && defined(OS_LINUX)
|
||||
/// Most x86_64 CPUs have 2-way SMT (Hyper-Threading).
|
||||
/// Aarch64 and RISC-V don't have SMT so far.
|
||||
/// POWER has SMT and it can be multi-way (e.g. 8-way), but we don't know how ClickHouse really behaves, so use all of them.
|
||||
|
||||
#if defined(__x86_64__) && defined(OS_LINUX)
|
||||
///
|
||||
/// On really big machines, SMT is detrimental to performance (+ ~5% overhead in ClickBench). On such machines, we limit ourself to the physical cores.
|
||||
/// Few cores indicate it is a small machine, runs in a VM or is a limited cloud instance --> it is reasonable to use all the cores.
|
||||
if (cpu_count >= 32)
|
||||
cpu_count = physical_concurrency();
|
||||
if (cores >= 32)
|
||||
cores = physical_concurrency();
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX)
|
||||
cpu_count = getCGroupLimitedCPUCores(cpu_count);
|
||||
cores = getCGroupLimitedCPUCores(cores);
|
||||
#endif
|
||||
|
||||
return cpu_count;
|
||||
return cores;
|
||||
}
|
||||
|
||||
}
|
||||
@ -203,6 +203,6 @@ unsigned getNumberOfPhysicalCPUCoresImpl()
|
||||
unsigned getNumberOfPhysicalCPUCores()
|
||||
{
|
||||
/// Calculate once.
|
||||
static auto res = getNumberOfPhysicalCPUCoresImpl();
|
||||
return res;
|
||||
static auto cores = getNumberOfPhysicalCPUCoresImpl();
|
||||
return cores;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user