Merge pull request #66648 from ClickHouse/read-cgroup-memory-usage-async-metrics

Simplify some memory tracking parts
This commit is contained in:
Antonio Andelic 2024-09-11 13:24:38 +00:00 committed by GitHub
commit 9b42af7263
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
28 changed files with 625 additions and 469 deletions

View File

@ -11,6 +11,7 @@
#include <Core/ServerUUID.h>
#include <Common/logger_useful.h>
#include <Common/CgroupsMemoryUsageObserver.h>
#include <Common/MemoryWorker.h>
#include <Common/ErrorHandlers.h>
#include <Common/assertProcessUserMatchesDataOwner.h>
#include <Common/makeSocketAddress.h>
@ -384,6 +385,9 @@ try
LOG_INFO(log, "Background threads finished in {} ms", watch.elapsedMilliseconds());
});
MemoryWorker memory_worker(config().getUInt64("memory_worker_period_ms", 0));
memory_worker.start();
static ServerErrorHandler error_handler;
Poco::ErrorHandler::set(&error_handler);
@ -425,8 +429,9 @@ try
for (const auto & server : *servers)
metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads(), server.refusedConnections()});
return metrics;
}
);
},
/*update_jemalloc_epoch_=*/memory_worker.getSource() != MemoryWorker::MemoryUsageSource::Jemalloc,
/*update_rss_=*/memory_worker.getSource() == MemoryWorker::MemoryUsageSource::None);
std::vector<std::string> listen_hosts = DB::getMultipleValuesFromConfig(config(), "", "listen_host");
@ -655,7 +660,6 @@ try
GWPAsan::initFinished();
#endif
LOG_INFO(log, "Ready for connections.");
waitForTerminationRequest();

View File

@ -11,7 +11,6 @@
#include <Poco/Util/HelpFormatter.h>
#include <Poco/Environment.h>
#include <Poco/Config.h>
#include <Common/Jemalloc.h>
#include <Common/scope_guard_safe.h>
#include <Common/logger_useful.h>
#include <base/phdr_cache.h>
@ -25,6 +24,7 @@
#include <base/Numa.h>
#include <Common/PoolId.h>
#include <Common/MemoryTracker.h>
#include <Common/MemoryWorker.h>
#include <Common/ClickHouseRevision.h>
#include <Common/DNSResolver.h>
#include <Common/CgroupsMemoryUsageObserver.h>
@ -111,6 +111,8 @@
#include <filesystem>
#include <unordered_set>
#include <Common/Jemalloc.h>
#include "config.h"
#include <Common/config_version.h>
@ -449,9 +451,12 @@ void checkForUsersNotInMainConfig(
}
}
namespace
{
/// Unused in other builds
#if defined(OS_LINUX)
static String readLine(const String & path)
String readLine(const String & path)
{
ReadBufferFromFile in(path);
String contents;
@ -459,7 +464,7 @@ static String readLine(const String & path)
return contents;
}
static int readNumber(const String & path)
int readNumber(const String & path)
{
ReadBufferFromFile in(path);
int result;
@ -469,7 +474,7 @@ static int readNumber(const String & path)
#endif
static void sanityChecks(Server & server)
void sanityChecks(Server & server)
{
std::string data_path = getCanonicalPath(server.config().getString("path", DBMS_DEFAULT_PATH));
std::string logs_path = server.config().getString("logger.log", "");
@ -590,6 +595,8 @@ static void sanityChecks(Server & server)
}
}
}
void loadStartupScripts(const Poco::Util::AbstractConfiguration & config, ContextMutablePtr context, Poco::Logger * log)
{
try
@ -906,6 +913,8 @@ try
LOG_INFO(log, "Background threads finished in {} ms", watch.elapsedMilliseconds());
});
MemoryWorker memory_worker(global_context->getServerSettings().memory_worker_period_ms);
/// This object will periodically calculate some metrics.
ServerAsynchronousMetrics async_metrics(
global_context,
@ -924,8 +933,9 @@ try
for (const auto & server : servers)
metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads(), server.refusedConnections()});
return metrics;
}
);
},
/*update_jemalloc_epoch_=*/memory_worker.getSource() != MemoryWorker::MemoryUsageSource::Jemalloc,
/*update_rss_=*/memory_worker.getSource() == MemoryWorker::MemoryUsageSource::None);
/// NOTE: global context should be destroyed *before* GlobalThreadPool::shutdown()
/// Otherwise GlobalThreadPool::shutdown() will hang, since Context holds some threads.
@ -1204,6 +1214,8 @@ try
FailPointInjection::enableFromGlobalConfig(config());
memory_worker.start();
int default_oom_score = 0;
#if !defined(NDEBUG)
@ -1547,15 +1559,6 @@ try
total_memory_tracker.setDescription("(total)");
total_memory_tracker.setMetric(CurrentMetrics::MemoryTracking);
if (cgroups_memory_usage_observer)
{
double hard_limit_ratio = new_server_settings.cgroup_memory_watcher_hard_limit_ratio;
double soft_limit_ratio = new_server_settings.cgroup_memory_watcher_soft_limit_ratio;
cgroups_memory_usage_observer->setMemoryUsageLimits(
static_cast<uint64_t>(max_server_memory_usage * hard_limit_ratio),
static_cast<uint64_t>(max_server_memory_usage * soft_limit_ratio));
}
size_t merges_mutations_memory_usage_soft_limit = new_server_settings.merges_mutations_memory_usage_soft_limit;
size_t default_merges_mutations_server_memory_usage = static_cast<size_t>(current_physical_server_memory * new_server_settings.merges_mutations_memory_usage_to_ram_ratio);
@ -1584,8 +1587,6 @@ try
background_memory_tracker.setDescription("(background)");
background_memory_tracker.setMetric(CurrentMetrics::MergesMutationsMemoryTracking);
total_memory_tracker.setAllowUseJemallocMemory(new_server_settings.allow_use_jemalloc_memory);
auto * global_overcommit_tracker = global_context->getGlobalOvercommitTracker();
total_memory_tracker.setOvercommitTracker(global_overcommit_tracker);

View File

@ -176,7 +176,7 @@ add_library (clickhouse_new_delete STATIC Common/new_delete.cpp)
target_link_libraries (clickhouse_new_delete PRIVATE clickhouse_common_io)
if (TARGET ch_contrib::jemalloc)
target_link_libraries (clickhouse_new_delete PRIVATE ch_contrib::jemalloc)
target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::jemalloc)
target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::jemalloc)
target_link_libraries (clickhouse_storages_system PRIVATE ch_contrib::jemalloc)
endif()

View File

@ -1,5 +1,3 @@
#include <Common/AsynchronousMetrics.h>
#include <IO/MMappedFileCache.h>
#include <IO/ReadHelpers.h>
#include <IO/UncompressedCache.h>
@ -8,8 +6,10 @@
#include <base/find_symbols.h>
#include <base/getPageSize.h>
#include <sys/resource.h>
#include <Common/AsynchronousMetrics.h>
#include <Common/CurrentMetrics.h>
#include <Common/Exception.h>
#include <Common/Jemalloc.h>
#include <Common/filesystemHelpers.h>
#include <Common/formatReadable.h>
#include <Common/logger_useful.h>
@ -69,10 +69,14 @@ static void openCgroupv2MetricFile(const std::string & filename, std::optional<R
AsynchronousMetrics::AsynchronousMetrics(
unsigned update_period_seconds,
const ProtocolServerMetricsFunc & protocol_server_metrics_func_)
const ProtocolServerMetricsFunc & protocol_server_metrics_func_,
bool update_jemalloc_epoch_,
bool update_rss_)
: update_period(update_period_seconds)
, log(getLogger("AsynchronousMetrics"))
, protocol_server_metrics_func(protocol_server_metrics_func_)
, update_jemalloc_epoch(update_jemalloc_epoch_)
, update_rss(update_rss_)
{
#if defined(OS_LINUX)
openFileIfExists("/proc/cpuinfo", cpuinfo);
@ -411,9 +415,7 @@ Value saveJemallocMetricImpl(
const std::string & jemalloc_full_name,
const std::string & clickhouse_full_name)
{
Value value{};
size_t size = sizeof(value);
mallctl(jemalloc_full_name.c_str(), &value, &size, nullptr, 0);
auto value = getJemallocValue<Value>(jemalloc_full_name.c_str());
values[clickhouse_full_name] = AsynchronousMetricValue(value, "An internal metric of the low-level memory allocator (jemalloc). See https://jemalloc.net/jemalloc.3.html");
return value;
}
@ -768,8 +770,11 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
// 'epoch' is a special mallctl -- it updates the statistics. Without it, all
// the following calls will return stale values. It increments and returns
// the current epoch number, which might be useful to log as a sanity check.
auto epoch = updateJemallocEpoch();
new_values["jemalloc.epoch"] = { epoch, "An internal incremental update number of the statistics of jemalloc (Jason Evans' memory allocator), used in all other `jemalloc` metrics." };
auto epoch = update_jemalloc_epoch ? updateJemallocEpoch() : getJemallocValue<uint64_t>("epoch");
new_values["jemalloc.epoch"]
= {epoch,
"An internal incremental update number of the statistics of jemalloc (Jason Evans' memory allocator), used in all other "
"`jemalloc` metrics."};
// Collect the statistics themselves.
saveJemallocMetric<size_t>(new_values, "allocated");
@ -782,10 +787,10 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
saveJemallocMetric<size_t>(new_values, "background_thread.num_threads");
saveJemallocMetric<uint64_t>(new_values, "background_thread.num_runs");
saveJemallocMetric<uint64_t>(new_values, "background_thread.run_intervals");
saveJemallocProf<size_t>(new_values, "active");
saveJemallocProf<bool>(new_values, "active");
saveAllArenasMetric<size_t>(new_values, "pactive");
[[maybe_unused]] size_t je_malloc_pdirty = saveAllArenasMetric<size_t>(new_values, "pdirty");
[[maybe_unused]] size_t je_malloc_pmuzzy = saveAllArenasMetric<size_t>(new_values, "pmuzzy");
saveAllArenasMetric<size_t>(new_values, "pdirty");
saveAllArenasMetric<size_t>(new_values, "pmuzzy");
saveAllArenasMetric<size_t>(new_values, "dirty_purged");
saveAllArenasMetric<size_t>(new_values, "muzzy_purged");
#endif
@ -814,41 +819,8 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
" It is unspecified whether it includes the per-thread stacks and most of the allocated memory, that is allocated with the 'mmap' system call."
" This metric exists only for completeness reasons. I recommend to use the `MemoryResident` metric for monitoring."};
/// We must update the value of total_memory_tracker periodically.
/// Otherwise it might be calculated incorrectly - it can include a "drift" of memory amount.
/// See https://github.com/ClickHouse/ClickHouse/issues/10293
{
Int64 amount = total_memory_tracker.get();
Int64 peak = total_memory_tracker.getPeak();
Int64 rss = data.resident;
Int64 free_memory_in_allocator_arenas = 0;
#if USE_JEMALLOC
/// According to jemalloc man, pdirty is:
///
/// Number of pages within unused extents that are potentially
/// dirty, and for which madvise() or similar has not been called.
///
/// So they will be subtracted from RSS to make accounting more
/// accurate, since those pages are not really RSS but a memory
/// that can be used at anytime via jemalloc.
free_memory_in_allocator_arenas = je_malloc_pdirty * getPageSize();
#endif
Int64 difference = rss - amount;
/// Log only if difference is high. This is for convenience. The threshold is arbitrary.
if (difference >= 1048576 || difference <= -1048576)
LOG_TRACE(log,
"MemoryTracking: was {}, peak {}, free memory in arenas {}, will set to {} (RSS), difference: {}",
ReadableSize(amount),
ReadableSize(peak),
ReadableSize(free_memory_in_allocator_arenas),
ReadableSize(rss),
ReadableSize(difference));
MemoryTracker::setRSS(rss, free_memory_in_allocator_arenas);
}
if (update_rss)
MemoryTracker::updateRSS(data.resident);
}
{

View File

@ -1,15 +1,14 @@
#pragma once
#include <Common/CgroupsMemoryUsageObserver.h>
#include <Common/MemoryStatisticsOS.h>
#include <Common/ThreadPool.h>
#include <Common/Stopwatch.h>
#include <IO/ReadBufferFromFile.h>
#include <condition_variable>
#include <map>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
#include <optional>
#include <unordered_map>
@ -69,7 +68,9 @@ public:
AsynchronousMetrics(
unsigned update_period_seconds,
const ProtocolServerMetricsFunc & protocol_server_metrics_func_);
const ProtocolServerMetricsFunc & protocol_server_metrics_func_,
bool update_jemalloc_epoch_,
bool update_rss_);
virtual ~AsynchronousMetrics();
@ -112,6 +113,9 @@ private:
MemoryStatisticsOS memory_stat TSA_GUARDED_BY(data_mutex);
#endif
[[maybe_unused]] const bool update_jemalloc_epoch;
[[maybe_unused]] const bool update_rss;
#if defined(OS_LINUX)
std::optional<ReadBufferFromFilePRead> meminfo TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> loadavg TSA_GUARDED_BY(data_mutex);

View File

@ -14,239 +14,21 @@
#include <fmt/ranges.h>
#include <cstdint>
#include <filesystem>
#include <memory>
#include <optional>
#include "config.h"
#if USE_JEMALLOC
# include <jemalloc/jemalloc.h>
#define STRINGIFY_HELPER(x) #x
#define STRINGIFY(x) STRINGIFY_HELPER(x)
#endif
using namespace DB;
namespace fs = std::filesystem;
namespace DB
{
namespace ErrorCodes
{
extern const int FILE_DOESNT_EXIST;
extern const int INCORRECT_DATA;
}
}
namespace
{
/// Format is
/// kernel 5
/// rss 15
/// [...]
using Metrics = std::map<std::string, uint64_t>;
Metrics readAllMetricsFromStatFile(ReadBufferFromFile & buf)
{
Metrics metrics;
while (!buf.eof())
{
std::string current_key;
readStringUntilWhitespace(current_key, buf);
assertChar(' ', buf);
uint64_t value = 0;
readIntText(value, buf);
assertChar('\n', buf);
auto [_, inserted] = metrics.emplace(std::move(current_key), value);
chassert(inserted, "Duplicate keys in stat file");
}
return metrics;
}
uint64_t readMetricFromStatFile(ReadBufferFromFile & buf, const std::string & key)
{
const auto all_metrics = readAllMetricsFromStatFile(buf);
if (const auto it = all_metrics.find(key); it != all_metrics.end())
return it->second;
throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot find '{}' in '{}'", key, buf.getFileName());
}
struct CgroupsV1Reader : ICgroupsReader
{
explicit CgroupsV1Reader(const fs::path & stat_file_dir) : buf(stat_file_dir / "memory.stat") { }
uint64_t readMemoryUsage() override
{
std::lock_guard lock(mutex);
buf.rewind();
return readMetricFromStatFile(buf, "rss");
}
std::string dumpAllStats() override
{
std::lock_guard lock(mutex);
buf.rewind();
return fmt::format("{}", readAllMetricsFromStatFile(buf));
}
private:
std::mutex mutex;
ReadBufferFromFile buf TSA_GUARDED_BY(mutex);
};
struct CgroupsV2Reader : ICgroupsReader
{
explicit CgroupsV2Reader(const fs::path & stat_file_dir)
: current_buf(stat_file_dir / "memory.current"), stat_buf(stat_file_dir / "memory.stat")
{
}
uint64_t readMemoryUsage() override
{
std::lock_guard lock(mutex);
current_buf.rewind();
stat_buf.rewind();
int64_t mem_usage = 0;
/// memory.current contains a single number
/// the reason why we subtract it described here: https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667
readIntText(mem_usage, current_buf);
mem_usage -= readMetricFromStatFile(stat_buf, "inactive_file");
chassert(mem_usage >= 0, "Negative memory usage");
return mem_usage;
}
std::string dumpAllStats() override
{
std::lock_guard lock(mutex);
stat_buf.rewind();
return fmt::format("{}", readAllMetricsFromStatFile(stat_buf));
}
private:
std::mutex mutex;
ReadBufferFromFile current_buf TSA_GUARDED_BY(mutex);
ReadBufferFromFile stat_buf TSA_GUARDED_BY(mutex);
};
/// Caveats:
/// - All of the logic in this file assumes that the current process is the only process in the
/// containing cgroup (or more precisely: the only process with significant memory consumption).
/// If this is not the case, then other processe's memory consumption may affect the internal
/// memory tracker ...
/// - Cgroups v1 and v2 allow nested cgroup hierarchies. As v1 is deprecated for over half a
/// decade and will go away at some point, hierarchical detection is only implemented for v2.
/// - I did not test what happens if a host has v1 and v2 simultaneously enabled. I believe such
/// systems existed only for a short transition period.
std::optional<std::string> getCgroupsV1Path()
{
auto path = default_cgroups_mount / "memory/memory.stat";
if (!fs::exists(path))
return {};
return {default_cgroups_mount / "memory"};
}
std::pair<std::string, CgroupsMemoryUsageObserver::CgroupsVersion> getCgroupsPath()
{
auto v2_path = getCgroupsV2PathContainingFile("memory.current");
if (v2_path.has_value())
return {*v2_path, CgroupsMemoryUsageObserver::CgroupsVersion::V2};
auto v1_path = getCgroupsV1Path();
if (v1_path.has_value())
return {*v1_path, CgroupsMemoryUsageObserver::CgroupsVersion::V1};
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot find cgroups v1 or v2 current memory file");
}
}
namespace DB
{
CgroupsMemoryUsageObserver::CgroupsMemoryUsageObserver(std::chrono::seconds wait_time_)
: log(getLogger("CgroupsMemoryUsageObserver")), wait_time(wait_time_)
{
const auto [cgroup_path, version] = getCgroupsPath();
cgroup_reader = createCgroupsReader(version, cgroup_path);
LOG_INFO(
log,
"Will read the current memory usage from '{}' (cgroups version: {}), wait time is {} sec",
cgroup_path,
(version == CgroupsVersion::V1) ? "v1" : "v2",
wait_time.count());
}
{}
CgroupsMemoryUsageObserver::~CgroupsMemoryUsageObserver()
{
stopThread();
}
void CgroupsMemoryUsageObserver::setMemoryUsageLimits(uint64_t hard_limit_, uint64_t soft_limit_)
{
std::lock_guard<std::mutex> limit_lock(limit_mutex);
if (hard_limit_ == hard_limit && soft_limit_ == soft_limit)
return;
hard_limit = hard_limit_;
soft_limit = soft_limit_;
on_hard_limit = [this, hard_limit_](bool up)
{
if (up)
{
LOG_WARNING(log, "Exceeded hard memory limit ({})", ReadableSize(hard_limit_));
/// Update current usage in memory tracker. Also reset free_memory_in_allocator_arenas to zero though we don't know if they are
/// really zero. Trying to avoid OOM ...
MemoryTracker::setRSS(hard_limit_, 0);
}
else
{
LOG_INFO(log, "Dropped below hard memory limit ({})", ReadableSize(hard_limit_));
}
};
on_soft_limit = [this, soft_limit_](bool up)
{
if (up)
{
LOG_WARNING(log, "Exceeded soft memory limit ({})", ReadableSize(soft_limit_));
# if USE_JEMALLOC
LOG_INFO(log, "Purging jemalloc arenas");
mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", nullptr, nullptr, nullptr, 0);
# endif
/// Reset current usage in memory tracker. Expect zero for free_memory_in_allocator_arenas as we just purged them.
uint64_t memory_usage = cgroup_reader->readMemoryUsage();
LOG_TRACE(
log,
"Read current memory usage {} bytes ({}) from cgroups, full available stats: {}",
memory_usage,
ReadableSize(memory_usage),
cgroup_reader->dumpAllStats());
MemoryTracker::setRSS(memory_usage, 0);
LOG_INFO(log, "Purged jemalloc arenas. Current memory usage is {}", ReadableSize(memory_usage));
}
else
{
LOG_INFO(log, "Dropped below soft memory limit ({})", ReadableSize(soft_limit_));
}
};
LOG_INFO(log, "Set new limits, soft limit: {}, hard limit: {}", ReadableSize(soft_limit_), ReadableSize(hard_limit_));
}
void CgroupsMemoryUsageObserver::setOnMemoryAmountAvailableChangedFn(OnMemoryAmountAvailableChangedFn on_memory_amount_available_changed_)
{
std::lock_guard<std::mutex> memory_amount_available_changed_lock(memory_amount_available_changed_mutex);
@ -300,35 +82,6 @@ void CgroupsMemoryUsageObserver::runThread()
std::lock_guard<std::mutex> memory_amount_available_changed_lock(memory_amount_available_changed_mutex);
on_memory_amount_available_changed();
}
std::lock_guard<std::mutex> limit_lock(limit_mutex);
if (soft_limit > 0 && hard_limit > 0)
{
uint64_t memory_usage = cgroup_reader->readMemoryUsage();
LOG_TRACE(log, "Read current memory usage {} bytes ({}) from cgroups", memory_usage, ReadableSize(memory_usage));
if (memory_usage > hard_limit)
{
if (last_memory_usage <= hard_limit)
on_hard_limit(true);
}
else
{
if (last_memory_usage > hard_limit)
on_hard_limit(false);
}
if (memory_usage > soft_limit)
{
if (last_memory_usage <= soft_limit)
on_soft_limit(true);
}
else
{
if (last_memory_usage > soft_limit)
on_soft_limit(false);
}
last_memory_usage = memory_usage;
}
}
catch (...)
{
@ -337,13 +90,6 @@ void CgroupsMemoryUsageObserver::runThread()
}
}
std::unique_ptr<ICgroupsReader> createCgroupsReader(CgroupsMemoryUsageObserver::CgroupsVersion version, const fs::path & cgroup_path)
{
if (version == CgroupsMemoryUsageObserver::CgroupsVersion::V2)
return std::make_unique<CgroupsV2Reader>(cgroup_path);
else
return std::make_unique<CgroupsV1Reader>(cgroup_path);
}
}
#endif

View File

@ -3,30 +3,12 @@
#include <Common/ThreadPool.h>
#include <chrono>
#include <memory>
#include <mutex>
namespace DB
{
struct ICgroupsReader
{
virtual ~ICgroupsReader() = default;
virtual uint64_t readMemoryUsage() = 0;
virtual std::string dumpAllStats() = 0;
};
/// Does two things:
/// 1. Periodically reads the memory usage of the process from Linux cgroups.
/// You can specify soft or hard memory limits:
/// - When the soft memory limit is hit, drop jemalloc cache.
/// - When the hard memory limit is hit, update MemoryTracking metric to throw memory exceptions faster.
/// The goal of this is to avoid that the process hits the maximum allowed memory limit at which there is a good
/// chance that the Limux OOM killer terminates it. All of this is done is because internal memory tracking in
/// ClickHouse can unfortunately under-estimate the actually used memory.
/// 2. Periodically reads the the maximum memory available to the process (which can change due to cgroups settings).
/// Periodically reads the the maximum memory available to the process (which can change due to cgroups settings).
/// You can specify a callback to react on changes. The callback typically reloads the configuration, i.e. Server
/// or Keeper configuration file. This reloads settings 'max_server_memory_usage' (Server) and 'max_memory_usage_soft_limit'
/// (Keeper) from which various other internal limits are calculated, including the soft and hard limits for (1.).
@ -37,19 +19,11 @@ struct ICgroupsReader
class CgroupsMemoryUsageObserver
{
public:
using OnMemoryLimitFn = std::function<void(bool)>;
using OnMemoryAmountAvailableChangedFn = std::function<void()>;
enum class CgroupsVersion : uint8_t
{
V1,
V2
};
explicit CgroupsMemoryUsageObserver(std::chrono::seconds wait_time_);
~CgroupsMemoryUsageObserver();
void setMemoryUsageLimits(uint64_t hard_limit_, uint64_t soft_limit_);
void setOnMemoryAmountAvailableChangedFn(OnMemoryAmountAvailableChangedFn on_memory_amount_available_changed_);
void startThread();
@ -60,32 +34,22 @@ private:
const std::chrono::seconds wait_time;
std::mutex limit_mutex;
size_t hard_limit TSA_GUARDED_BY(limit_mutex) = 0;
size_t soft_limit TSA_GUARDED_BY(limit_mutex) = 0;
OnMemoryLimitFn on_hard_limit TSA_GUARDED_BY(limit_mutex);
OnMemoryLimitFn on_soft_limit TSA_GUARDED_BY(limit_mutex);
std::mutex memory_amount_available_changed_mutex;
OnMemoryAmountAvailableChangedFn on_memory_amount_available_changed TSA_GUARDED_BY(memory_amount_available_changed_mutex);
uint64_t last_memory_usage = 0; /// how much memory does the process use
uint64_t last_available_memory_amount; /// how much memory can the process use
void stopThread();
void runThread();
std::unique_ptr<ICgroupsReader> cgroup_reader;
std::mutex thread_mutex;
std::condition_variable cond;
ThreadFromGlobalPool thread;
bool quit = false;
};
std::unique_ptr<ICgroupsReader>
createCgroupsReader(CgroupsMemoryUsageObserver::CgroupsVersion version, const std::filesystem::path & cgroup_path);
#else
class CgroupsMemoryUsageObserver
{

View File

@ -5,7 +5,6 @@
#include <Common/Exception.h>
#include <Common/Stopwatch.h>
#include <Common/logger_useful.h>
#include <jemalloc/jemalloc.h>
#define STRINGIFY_HELPER(x) #x
#define STRINGIFY(x) STRINGIFY_HELPER(x)
@ -26,7 +25,6 @@ namespace ErrorCodes
void purgeJemallocArenas()
{
LOG_TRACE(getLogger("SystemJemalloc"), "Purging unused memory");
Stopwatch watch;
mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", nullptr, nullptr, nullptr, 0);
ProfileEvents::increment(ProfileEvents::MemoryAllocatorPurge);
@ -46,20 +44,6 @@ void checkJemallocProfilingEnabled()
"set: MALLOC_CONF=background_thread:true,prof:true");
}
template <typename T>
void setJemallocValue(const char * name, T value)
{
T old_value;
size_t old_value_size = sizeof(T);
if (mallctl(name, &old_value, &old_value_size, reinterpret_cast<void*>(&value), sizeof(T)))
{
LOG_WARNING(getLogger("Jemalloc"), "mallctl for {} failed", name);
return;
}
LOG_INFO(getLogger("Jemalloc"), "Value for {} set to {} (from {})", name, value, old_value);
}
void setJemallocProfileActive(bool value)
{
checkJemallocProfilingEnabled();

View File

@ -5,6 +5,8 @@
#if USE_JEMALLOC
#include <string>
#include <Common/logger_useful.h>
#include <jemalloc/jemalloc.h>
namespace DB
{
@ -21,6 +23,59 @@ void setJemallocBackgroundThreads(bool enabled);
void setJemallocMaxBackgroundThreads(size_t max_threads);
template <typename T>
void setJemallocValue(const char * name, T value)
{
T old_value;
size_t old_value_size = sizeof(T);
mallctl(name, &old_value, &old_value_size, reinterpret_cast<void*>(&value), sizeof(T));
LOG_INFO(getLogger("Jemalloc"), "Value for {} set to {} (from {})", name, value, old_value);
}
template <typename T>
T getJemallocValue(const char * name)
{
T value;
size_t value_size = sizeof(T);
mallctl(name, &value, &value_size, nullptr, 0);
return value;
}
/// Each mallctl call consists of string name lookup which can be expensive.
/// This can be avoided by translating name to "Management Information Base" (MIB)
/// and using it in mallctlbymib calls
template <typename T>
struct JemallocMibCache
{
explicit JemallocMibCache(const char * name)
{
mallctlnametomib(name, mib, &mib_length);
}
void setValue(T value)
{
mallctlbymib(mib, mib_length, nullptr, nullptr, reinterpret_cast<void*>(&value), sizeof(T));
}
T getValue()
{
T value;
size_t value_size = sizeof(T);
mallctlbymib(mib, mib_length, &value, &value_size, nullptr, 0);
return value;
}
void run()
{
mallctlbymib(mib, mib_length, nullptr, nullptr, nullptr, 0);
}
private:
static constexpr size_t max_mib_length = 4;
size_t mib[max_mib_length];
size_t mib_length = max_mib_length;
};
}
#endif

View File

@ -20,13 +20,9 @@
#if USE_JEMALLOC
# include <jemalloc/jemalloc.h>
#define STRINGIFY_HELPER(x) #x
#define STRINGIFY(x) STRINGIFY_HELPER(x)
#endif
#include <atomic>
#include <cmath>
#include <random>
#include <cstdlib>
#include <string>
@ -115,8 +111,6 @@ void AllocationTrace::onFreeImpl(void * ptr, size_t size) const
namespace ProfileEvents
{
extern const Event QueryMemoryLimitExceeded;
extern const Event MemoryAllocatorPurge;
extern const Event MemoryAllocatorPurgeTimeMicroseconds;
}
using namespace std::chrono_literals;
@ -126,15 +120,13 @@ static constexpr size_t log_peak_memory_usage_every = 1ULL << 30;
MemoryTracker total_memory_tracker(nullptr, VariableContext::Global);
MemoryTracker background_memory_tracker(&total_memory_tracker, VariableContext::User, false);
std::atomic<Int64> MemoryTracker::free_memory_in_allocator_arenas;
MemoryTracker::MemoryTracker(VariableContext level_) : parent(&total_memory_tracker), level(level_) {}
MemoryTracker::MemoryTracker(MemoryTracker * parent_, VariableContext level_) : parent(parent_), level(level_) {}
MemoryTracker::MemoryTracker(MemoryTracker * parent_, VariableContext level_, bool log_peak_memory_usage_in_destructor_)
: parent(parent_)
, log_peak_memory_usage_in_destructor(log_peak_memory_usage_in_destructor_)
, level(level_)
{}
: parent(parent_), log_peak_memory_usage_in_destructor(log_peak_memory_usage_in_destructor_), level(level_)
{
}
MemoryTracker::~MemoryTracker()
{
@ -204,10 +196,14 @@ void MemoryTracker::debugLogBigAllocationWithoutCheck(Int64 size [[maybe_unused]
return;
MemoryTrackerBlockerInThread blocker(VariableContext::Global);
LOG_TEST(getLogger("MemoryTracker"), "Too big allocation ({} bytes) without checking memory limits, "
"it may lead to OOM. Stack trace: {}", size, StackTrace().toString());
LOG_TEST(
getLogger("MemoryTracker"),
"Too big allocation ({} bytes) without checking memory limits, "
"it may lead to OOM. Stack trace: {}",
size,
StackTrace().toString());
#else
return; /// Avoid trash logging in release builds
/// Avoid trash logging in release builds
#endif
}
@ -228,6 +224,7 @@ AllocationTrace MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceed
{
/// For global memory tracker always update memory usage.
amount.fetch_add(size, std::memory_order_relaxed);
rss.fetch_add(size, std::memory_order_relaxed);
auto metric_loaded = metric.load(std::memory_order_relaxed);
if (metric_loaded != CurrentMetrics::end())
@ -249,6 +246,7 @@ AllocationTrace MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceed
* So, we allow over-allocations.
*/
Int64 will_be = size ? size + amount.fetch_add(size, std::memory_order_relaxed) : amount.load(std::memory_order_relaxed);
Int64 will_be_rss = size ? size + rss.fetch_add(size, std::memory_order_relaxed) : rss.load(std::memory_order_relaxed);
auto metric_loaded = metric.load(std::memory_order_relaxed);
if (metric_loaded != CurrentMetrics::end() && size)
@ -275,6 +273,7 @@ AllocationTrace MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceed
{
/// Revert
amount.fetch_sub(size, std::memory_order_relaxed);
rss.fetch_sub(size, std::memory_order_relaxed);
/// Prevent recursion. Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc
MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global);
@ -297,33 +296,8 @@ AllocationTrace MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceed
}
}
Int64 limit_to_check = current_hard_limit;
#if USE_JEMALLOC
if (level == VariableContext::Global && allow_use_jemalloc_memory.load(std::memory_order_relaxed))
{
/// Jemalloc arenas may keep some extra memory.
/// This memory was substucted from RSS to decrease memory drift.
/// In case memory is close to limit, try to pugre the arenas.
/// This is needed to avoid OOM, because some allocations are directly done with mmap.
Int64 current_free_memory_in_allocator_arenas = free_memory_in_allocator_arenas.load(std::memory_order_relaxed);
if (current_free_memory_in_allocator_arenas > 0 && current_hard_limit && current_free_memory_in_allocator_arenas + will_be > current_hard_limit)
{
if (free_memory_in_allocator_arenas.exchange(-current_free_memory_in_allocator_arenas) > 0)
{
Stopwatch watch;
mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", nullptr, nullptr, nullptr, 0);
ProfileEvents::increment(ProfileEvents::MemoryAllocatorPurge);
ProfileEvents::increment(ProfileEvents::MemoryAllocatorPurgeTimeMicroseconds, watch.elapsedMicroseconds());
}
}
limit_to_check += abs(current_free_memory_in_allocator_arenas);
}
#endif
if (unlikely(current_hard_limit && will_be > limit_to_check))
if (unlikely(
current_hard_limit && (will_be > current_hard_limit || (level == VariableContext::Global && will_be_rss > current_hard_limit))))
{
if (memoryTrackerCanThrow(level, false) && throw_if_memory_exceeded)
{
@ -335,6 +309,7 @@ AllocationTrace MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceed
{
/// Revert
amount.fetch_sub(size, std::memory_order_relaxed);
rss.fetch_sub(size, std::memory_order_relaxed);
/// Prevent recursion. Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc
MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global);
@ -343,12 +318,13 @@ AllocationTrace MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceed
throw DB::Exception(
DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED,
"Memory limit{}{} exceeded: "
"would use {} (attempt to allocate chunk of {} bytes), maximum: {}."
"would use {} (attempt to allocate chunk of {} bytes), current RSS {}, maximum: {}."
"{}{}",
description ? " " : "",
description ? description : "",
formatReadableSizeWithBinarySuffix(will_be),
size,
formatReadableSizeWithBinarySuffix(rss.load(std::memory_order_relaxed)),
formatReadableSizeWithBinarySuffix(current_hard_limit),
overcommit_result == OvercommitResult::NONE ? "" : " OvercommitTracker decision: ",
toDescription(overcommit_result));
@ -442,6 +418,7 @@ AllocationTrace MemoryTracker::free(Int64 size, double _sample_probability)
{
/// For global memory tracker always update memory usage.
amount.fetch_sub(size, std::memory_order_relaxed);
rss.fetch_sub(size, std::memory_order_relaxed);
auto metric_loaded = metric.load(std::memory_order_relaxed);
if (metric_loaded != CurrentMetrics::end())
CurrentMetrics::sub(metric_loaded, size);
@ -455,7 +432,12 @@ AllocationTrace MemoryTracker::free(Int64 size, double _sample_probability)
}
Int64 accounted_size = size;
if (level == VariableContext::Thread || level == VariableContext::Global)
if (level == VariableContext::Global)
{
amount.fetch_sub(accounted_size, std::memory_order_relaxed);
rss.fetch_sub(accounted_size, std::memory_order_relaxed);
}
else if (level == VariableContext::Thread)
{
/// Could become negative if memory allocated in this thread is freed in another one
amount.fetch_sub(accounted_size, std::memory_order_relaxed);
@ -529,21 +511,29 @@ void MemoryTracker::reset()
}
void MemoryTracker::setRSS(Int64 rss_, Int64 free_memory_in_allocator_arenas_)
void MemoryTracker::updateRSS(Int64 rss_)
{
Int64 new_amount = rss_;
total_memory_tracker.rss.store(rss_, std::memory_order_relaxed);
}
void MemoryTracker::updateAllocated(Int64 allocated_)
{
Int64 new_amount = allocated_;
LOG_INFO(
getLogger("MemoryTracker"),
"Correcting the value of global memory tracker from {} to {}",
ReadableSize(total_memory_tracker.amount.load(std::memory_order_relaxed)),
ReadableSize(allocated_));
total_memory_tracker.amount.store(new_amount, std::memory_order_relaxed);
free_memory_in_allocator_arenas.store(free_memory_in_allocator_arenas_, std::memory_order_relaxed);
auto metric_loaded = total_memory_tracker.metric.load(std::memory_order_relaxed);
if (metric_loaded != CurrentMetrics::end())
CurrentMetrics::set(metric_loaded, new_amount);
bool log_memory_usage = true;
total_memory_tracker.updatePeak(rss_, log_memory_usage);
total_memory_tracker.updatePeak(new_amount, log_memory_usage);
}
void MemoryTracker::setSoftLimit(Int64 value)
{
soft_limit.store(value, std::memory_order_relaxed);

View File

@ -2,7 +2,6 @@
#include <atomic>
#include <chrono>
#include <optional>
#include <base/types.h>
#include <Common/CurrentMetrics.h>
#include <Common/VariableContext.h>
@ -57,9 +56,8 @@ private:
std::atomic<Int64> soft_limit {0};
std::atomic<Int64> hard_limit {0};
std::atomic<Int64> profiler_limit {0};
std::atomic_bool allow_use_jemalloc_memory {true};
static std::atomic<Int64> free_memory_in_allocator_arenas;
std::atomic<Int64> rss{0};
Int64 profiler_step = 0;
@ -122,6 +120,11 @@ public:
return amount.load(std::memory_order_relaxed);
}
Int64 getRSS() const
{
return rss.load(std::memory_order_relaxed);
}
// Merges and mutations may pass memory ownership to other threads thus in the end of execution
// MemoryTracker for background task may have a non-zero counter.
// This method is intended to fix the counter inside of background_memory_tracker.
@ -154,14 +157,6 @@ public:
{
return soft_limit.load(std::memory_order_relaxed);
}
void setAllowUseJemallocMemory(bool value)
{
allow_use_jemalloc_memory.store(value, std::memory_order_relaxed);
}
bool getAllowUseJemallocMmemory() const
{
return allow_use_jemalloc_memory.load(std::memory_order_relaxed);
}
/** Set limit if it was not set.
* Otherwise, set limit to new value, if new value is greater than previous limit.
@ -249,10 +244,9 @@ public:
/// Reset the accumulated data.
void reset();
/// Reset current counter to an RSS value.
/// Jemalloc may have pre-allocated arenas, they are accounted in RSS.
/// We can free this arenas in case of exception to avoid OOM.
static void setRSS(Int64 rss_, Int64 free_memory_in_allocator_arenas_);
/// update values based on external information (e.g. jemalloc's stat)
static void updateRSS(Int64 rss_);
static void updateAllocated(Int64 allocated_);
/// Prints info about peak memory consumption into log.
void logPeakMemoryUsage();

333
src/Common/MemoryWorker.cpp Normal file
View File

@ -0,0 +1,333 @@
#include <Common/MemoryWorker.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadBufferFromFileDescriptor.h>
#include <IO/ReadHelpers.h>
#include <base/cgroupsv2.h>
#include <Common/Jemalloc.h>
#include <Common/MemoryTracker.h>
#include <Common/ProfileEvents.h>
#include <Common/formatReadable.h>
#include <Common/logger_useful.h>
#include <fmt/ranges.h>
#include <filesystem>
#include <optional>
namespace fs = std::filesystem;
namespace ProfileEvents
{
extern const Event MemoryAllocatorPurge;
extern const Event MemoryAllocatorPurgeTimeMicroseconds;
extern const Event MemoryWorkerRun;
extern const Event MemoryWorkerRunElapsedMicroseconds;
}
namespace DB
{
namespace ErrorCodes
{
extern const int FILE_DOESNT_EXIST;
extern const int LOGICAL_ERROR;
}
#if defined(OS_LINUX)
namespace
{
using Metrics = std::map<std::string, uint64_t>;
/// Format is
/// kernel 5
/// rss 15
/// [...]
Metrics readAllMetricsFromStatFile(ReadBufferFromFile & buf)
{
Metrics metrics;
while (!buf.eof())
{
std::string current_key;
readStringUntilWhitespace(current_key, buf);
assertChar(' ', buf);
uint64_t value = 0;
readIntText(value, buf);
assertChar('\n', buf);
auto [_, inserted] = metrics.emplace(std::move(current_key), value);
chassert(inserted, "Duplicate keys in stat file");
}
return metrics;
}
uint64_t readMetricFromStatFile(ReadBufferFromFile & buf, std::string_view key)
{
while (!buf.eof())
{
std::string current_key;
readStringUntilWhitespace(current_key, buf);
if (current_key != key)
{
std::string dummy;
readStringUntilNewlineInto(dummy, buf);
buf.ignore();
continue;
}
assertChar(' ', buf);
uint64_t value = 0;
readIntText(value, buf);
return value;
}
LOG_ERROR(getLogger("CgroupsReader"), "Cannot find '{}' in '{}'", key, buf.getFileName());
return 0;
}
struct CgroupsV1Reader : ICgroupsReader
{
explicit CgroupsV1Reader(const fs::path & stat_file_dir) : buf(stat_file_dir / "memory.stat") { }
uint64_t readMemoryUsage() override
{
std::lock_guard lock(mutex);
buf.rewind();
return readMetricFromStatFile(buf, "rss");
}
std::string dumpAllStats() override
{
std::lock_guard lock(mutex);
buf.rewind();
return fmt::format("{}", readAllMetricsFromStatFile(buf));
}
private:
std::mutex mutex;
ReadBufferFromFile buf TSA_GUARDED_BY(mutex);
};
struct CgroupsV2Reader : ICgroupsReader
{
explicit CgroupsV2Reader(const fs::path & stat_file_dir) : stat_buf(stat_file_dir / "memory.stat") { }
uint64_t readMemoryUsage() override
{
std::lock_guard lock(mutex);
stat_buf.rewind();
return readMetricFromStatFile(stat_buf, "anon");
}
std::string dumpAllStats() override
{
std::lock_guard lock(mutex);
stat_buf.rewind();
return fmt::format("{}", readAllMetricsFromStatFile(stat_buf));
}
private:
std::mutex mutex;
ReadBufferFromFile stat_buf TSA_GUARDED_BY(mutex);
};
/// Caveats:
/// - All of the logic in this file assumes that the current process is the only process in the
/// containing cgroup (or more precisely: the only process with significant memory consumption).
/// If this is not the case, then other processe's memory consumption may affect the internal
/// memory tracker ...
/// - Cgroups v1 and v2 allow nested cgroup hierarchies. As v1 is deprecated for over half a
/// decade and will go away at some point, hierarchical detection is only implemented for v2.
/// - I did not test what happens if a host has v1 and v2 simultaneously enabled. I believe such
/// systems existed only for a short transition period.
std::optional<std::string> getCgroupsV1Path()
{
auto path = default_cgroups_mount / "memory/memory.stat";
if (!fs::exists(path))
return {};
return {default_cgroups_mount / "memory"};
}
std::pair<std::string, ICgroupsReader::CgroupsVersion> getCgroupsPath()
{
auto v2_path = getCgroupsV2PathContainingFile("memory.current");
if (v2_path.has_value())
return {*v2_path, ICgroupsReader::CgroupsVersion::V2};
auto v1_path = getCgroupsV1Path();
if (v1_path.has_value())
return {*v1_path, ICgroupsReader::CgroupsVersion::V1};
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot find cgroups v1 or v2 current memory file");
}
}
std::shared_ptr<ICgroupsReader> ICgroupsReader::createCgroupsReader(ICgroupsReader::CgroupsVersion version, const std::filesystem::path & cgroup_path)
{
if (version == CgroupsVersion::V2)
return std::make_shared<CgroupsV2Reader>(cgroup_path);
else
{
chassert(version == CgroupsVersion::V1);
return std::make_shared<CgroupsV1Reader>(cgroup_path);
}
}
#endif
namespace
{
std::string_view sourceToString(MemoryWorker::MemoryUsageSource source)
{
switch (source)
{
case MemoryWorker::MemoryUsageSource::Cgroups: return "Cgroups";
case MemoryWorker::MemoryUsageSource::Jemalloc: return "Jemalloc";
case MemoryWorker::MemoryUsageSource::None: return "None";
}
}
}
/// We try to pick the best possible supported source for reading memory usage.
/// Supported sources in order of priority
/// - reading from cgroups' pseudo-files (fastest and most accurate)
/// - reading jemalloc's resident stat (doesn't take into account allocations that didn't use jemalloc)
/// Also, different tick rates are used because not all options are equally fast
MemoryWorker::MemoryWorker(uint64_t period_ms_)
: log(getLogger("MemoryWorker"))
, period_ms(period_ms_)
{
#if defined(OS_LINUX)
try
{
static constexpr uint64_t cgroups_memory_usage_tick_ms{50};
const auto [cgroup_path, version] = getCgroupsPath();
LOG_INFO(
getLogger("CgroupsReader"),
"Will create cgroup reader from '{}' (cgroups version: {})",
cgroup_path,
(version == ICgroupsReader::CgroupsVersion::V1) ? "v1" : "v2");
cgroups_reader = ICgroupsReader::createCgroupsReader(version, cgroup_path);
source = MemoryUsageSource::Cgroups;
if (period_ms == 0)
period_ms = cgroups_memory_usage_tick_ms;
return;
}
catch (...)
{
tryLogCurrentException(log, "Cannot use cgroups reader");
}
#endif
#if USE_JEMALLOC
static constexpr uint64_t jemalloc_memory_usage_tick_ms{100};
source = MemoryUsageSource::Jemalloc;
if (period_ms == 0)
period_ms = jemalloc_memory_usage_tick_ms;
#endif
}
MemoryWorker::MemoryUsageSource MemoryWorker::getSource()
{
return source;
}
void MemoryWorker::start()
{
if (source == MemoryUsageSource::None)
return;
LOG_INFO(
getLogger("MemoryWorker"),
"Starting background memory thread with period of {}ms, using {} as source",
period_ms,
sourceToString(source));
background_thread = ThreadFromGlobalPool([this] { backgroundThread(); });
}
MemoryWorker::~MemoryWorker()
{
{
std::unique_lock lock(mutex);
shutdown = true;
}
cv.notify_all();
if (background_thread.joinable())
background_thread.join();
}
uint64_t MemoryWorker::getMemoryUsage()
{
switch (source)
{
case MemoryUsageSource::Cgroups:
return cgroups_reader != nullptr ? cgroups_reader->readMemoryUsage() : 0;
case MemoryUsageSource::Jemalloc:
#if USE_JEMALLOC
return resident_mib.getValue();
#else
return 0;
#endif
case MemoryUsageSource::None:
throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Trying to fetch memory usage while no memory source can be used");
}
}
void MemoryWorker::backgroundThread()
{
std::chrono::milliseconds chrono_period_ms{period_ms};
[[maybe_unused]] bool first_run = true;
std::unique_lock lock(mutex);
while (true)
{
cv.wait_for(lock, chrono_period_ms, [this] { return shutdown; });
if (shutdown)
return;
Stopwatch total_watch;
#if USE_JEMALLOC
if (source == MemoryUsageSource::Jemalloc)
epoch_mib.setValue(0);
#endif
Int64 resident = getMemoryUsage();
MemoryTracker::updateRSS(resident);
#if USE_JEMALLOC
if (resident > total_memory_tracker.getHardLimit())
{
Stopwatch purge_watch;
purge_mib.run();
ProfileEvents::increment(ProfileEvents::MemoryAllocatorPurge);
ProfileEvents::increment(ProfileEvents::MemoryAllocatorPurgeTimeMicroseconds, purge_watch.elapsedMicroseconds());
}
#endif
#if USE_JEMALLOC
if (unlikely(first_run || total_memory_tracker.get() < 0))
{
if (source != MemoryUsageSource::Jemalloc)
epoch_mib.setValue(0);
MemoryTracker::updateAllocated(allocated_mib.getValue());
}
#endif
ProfileEvents::increment(ProfileEvents::MemoryWorkerRun);
ProfileEvents::increment(ProfileEvents::MemoryWorkerRunElapsedMicroseconds, total_watch.elapsedMicroseconds());
first_run = false;
}
}
}

84
src/Common/MemoryWorker.h Normal file
View File

@ -0,0 +1,84 @@
#pragma once
#include <Common/CgroupsMemoryUsageObserver.h>
#include <Common/ThreadPool.h>
#include <Common/Jemalloc.h>
namespace DB
{
struct ICgroupsReader
{
enum class CgroupsVersion : uint8_t
{
V1,
V2
};
#if defined(OS_LINUX)
static std::shared_ptr<ICgroupsReader>
createCgroupsReader(ICgroupsReader::CgroupsVersion version, const std::filesystem::path & cgroup_path);
#endif
virtual ~ICgroupsReader() = default;
virtual uint64_t readMemoryUsage() = 0;
virtual std::string dumpAllStats() = 0;
};
/// Correct MemoryTracker based on external information (e.g. Cgroups or stats.resident from jemalloc)
/// The worker spawns a background thread which periodically reads current resident memory from the source,
/// whose value is sent to global MemoryTracker.
/// It can do additional things like purging jemalloc dirty pages if the current memory usage is higher than global hard limit.
class MemoryWorker
{
public:
explicit MemoryWorker(uint64_t period_ms_);
enum class MemoryUsageSource : uint8_t
{
None,
Cgroups,
Jemalloc
};
MemoryUsageSource getSource();
void start();
~MemoryWorker();
private:
uint64_t getMemoryUsage();
void backgroundThread();
ThreadFromGlobalPool background_thread;
std::mutex mutex;
std::condition_variable cv;
bool shutdown = false;
LoggerPtr log;
uint64_t period_ms;
MemoryUsageSource source{MemoryUsageSource::None};
std::shared_ptr<ICgroupsReader> cgroups_reader;
#if USE_JEMALLOC
JemallocMibCache<uint64_t> epoch_mib{"epoch"};
JemallocMibCache<size_t> resident_mib{"stats.resident"};
JemallocMibCache<size_t> allocated_mib{"stats.allocated"};
#define STRINGIFY_HELPER(x) #x
#define STRINGIFY(x) STRINGIFY_HELPER(x)
JemallocMibCache<size_t> purge_mib{"arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge"};
#undef STRINGIFY
#undef STRINGIFY_HELPER
#endif
};
}

View File

@ -827,6 +827,9 @@ The server successfully detected this situation and will download merged part fr
M(GWPAsanAllocateSuccess, "Number of successful allocations done by GWPAsan") \
M(GWPAsanAllocateFailed, "Number of failed allocations done by GWPAsan (i.e. filled pool)") \
M(GWPAsanFree, "Number of free operations done by GWPAsan") \
\
M(MemoryWorkerRun, "Number of runs done by MemoryWorker in background") \
M(MemoryWorkerRunElapsedMicroseconds, "Total time spent by MemoryWorker for background work") \
#ifdef APPLY_FOR_EXTERNAL_EVENTS

View File

@ -6,7 +6,7 @@
#include <filesystem>
#include <IO/WriteBufferFromFile.h>
#include <Common/CgroupsMemoryUsageObserver.h>
#include <Common/MemoryWorker.h>
#include <Common/filesystemHelpers.h>
using namespace DB;
@ -126,7 +126,7 @@ const std::string EXPECTED[2]
"\"workingset_restore_anon\": 0, \"workingset_restore_file\": 0, \"zswap\": 0, \"zswapped\": 0, \"zswpin\": 0, \"zswpout\": 0}"};
class CgroupsMemoryUsageObserverFixture : public ::testing::TestWithParam<CgroupsMemoryUsageObserver::CgroupsVersion>
class CgroupsMemoryUsageObserverFixture : public ::testing::TestWithParam<ICgroupsReader::CgroupsVersion>
{
void SetUp() override
{
@ -138,7 +138,7 @@ class CgroupsMemoryUsageObserverFixture : public ::testing::TestWithParam<Cgroup
stat_file.write(SAMPLE_FILE[version].data(), SAMPLE_FILE[version].size());
stat_file.sync();
if (GetParam() == CgroupsMemoryUsageObserver::CgroupsVersion::V2)
if (GetParam() == ICgroupsReader::CgroupsVersion::V2)
{
auto current_file = WriteBufferFromFile(tmp_dir + "/memory.current");
current_file.write("29645422592", 11);
@ -154,18 +154,18 @@ protected:
TEST_P(CgroupsMemoryUsageObserverFixture, ReadMemoryUsageTest)
{
const auto version = GetParam();
auto reader = createCgroupsReader(version, tmp_dir);
auto reader = ICgroupsReader::createCgroupsReader(version, tmp_dir);
ASSERT_EQ(
reader->readMemoryUsage(),
version == CgroupsMemoryUsageObserver::CgroupsVersion::V1 ? /* rss from memory.stat */ 2232029184
: /* value from memory.current - inactive_file */ 20952338432);
version == ICgroupsReader::CgroupsVersion::V1 ? /* rss from memory.stat */ 2232029184
: /* anon from memory.stat */ 10429399040);
}
TEST_P(CgroupsMemoryUsageObserverFixture, DumpAllStatsTest)
{
const auto version = GetParam();
auto reader = createCgroupsReader(version, tmp_dir);
auto reader = ICgroupsReader::createCgroupsReader(version, tmp_dir);
ASSERT_EQ(reader->dumpAllStats(), EXPECTED[static_cast<uint8_t>(version)]);
}
@ -173,6 +173,6 @@ TEST_P(CgroupsMemoryUsageObserverFixture, DumpAllStatsTest)
INSTANTIATE_TEST_SUITE_P(
CgroupsMemoryUsageObserverTests,
CgroupsMemoryUsageObserverFixture,
::testing::Values(CgroupsMemoryUsageObserver::CgroupsVersion::V1, CgroupsMemoryUsageObserver::CgroupsVersion::V2));
::testing::Values(ICgroupsReader::CgroupsVersion::V1, ICgroupsReader::CgroupsVersion::V2));
#endif

View File

@ -114,8 +114,13 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM
}
KeeperAsynchronousMetrics::KeeperAsynchronousMetrics(
ContextPtr context_, unsigned update_period_seconds, const ProtocolServerMetricsFunc & protocol_server_metrics_func_)
: AsynchronousMetrics(update_period_seconds, protocol_server_metrics_func_), context(std::move(context_))
ContextPtr context_,
unsigned update_period_seconds,
const ProtocolServerMetricsFunc & protocol_server_metrics_func_,
bool update_jemalloc_epoch_,
bool update_rss_)
: AsynchronousMetrics(update_period_seconds, protocol_server_metrics_func_, update_jemalloc_epoch_, update_rss_)
, context(std::move(context_))
{
}

View File

@ -13,9 +13,13 @@ class KeeperAsynchronousMetrics : public AsynchronousMetrics
{
public:
KeeperAsynchronousMetrics(
ContextPtr context_, unsigned update_period_seconds, const ProtocolServerMetricsFunc & protocol_server_metrics_func_);
~KeeperAsynchronousMetrics() override;
ContextPtr context_,
unsigned update_period_seconds,
const ProtocolServerMetricsFunc & protocol_server_metrics_func_,
bool update_jemalloc_epoch_,
bool update_rss_);
~KeeperAsynchronousMetrics() override;
private:
ContextPtr context;

View File

@ -148,7 +148,14 @@ void KeeperDispatcher::requestThread()
Int64 mem_soft_limit = keeper_context->getKeeperMemorySoftLimit();
if (configuration_and_settings->standalone_keeper && isExceedingMemorySoftLimit() && checkIfRequestIncreaseMem(request.request))
{
LOG_WARNING(log, "Processing requests refused because of max_memory_usage_soft_limit {}, the total used memory is {}, request type is {}", ReadableSize(mem_soft_limit), ReadableSize(total_memory_tracker.get()), request.request->getOpNum());
LOG_WARNING(
log,
"Processing requests refused because of max_memory_usage_soft_limit {}, the total allocated memory is {}, RSS is {}, request type "
"is {}",
ReadableSize(mem_soft_limit),
ReadableSize(total_memory_tracker.get()),
ReadableSize(total_memory_tracker.getRSS()),
request.request->getOpNum());
addErrorResponses({request}, Coordination::Error::ZCONNECTIONLOSS);
continue;
}

View File

@ -602,7 +602,7 @@ bool KeeperServer::isLeaderAlive() const
bool KeeperServer::isExceedingMemorySoftLimit() const
{
Int64 mem_soft_limit = keeper_context->getKeeperMemorySoftLimit();
return mem_soft_limit > 0 && total_memory_tracker.get() >= mem_soft_limit;
return mem_soft_limit > 0 && std::max(total_memory_tracker.get(), total_memory_tracker.getRSS()) >= mem_soft_limit;
}
/// TODO test whether taking failed peer in count

View File

@ -170,6 +170,7 @@ namespace DB
M(Bool, prepare_system_log_tables_on_startup, false, "If true, ClickHouse creates all configured `system.*_log` tables before the startup. It can be helpful if some startup scripts depend on these tables.", 0) \
M(Double, gwp_asan_force_sample_probability, 0.0003, "Probability that an allocation from specific places will be sampled by GWP Asan (i.e. PODArray allocations)", 0) \
M(UInt64, config_reload_interval_ms, 2000, "How often clickhouse will reload config and check for new changes", 0) \
M(UInt64, memory_worker_period_ms, 0, "Tick period of background memory worker which corrects memory tracker memory usages and cleans up unused pages during higher memory usage. If set to 0, default value will be used depending on the memory usage source", 0) \
M(Bool, disable_insertion_and_mutation, false, "Disable all insert/alter/delete queries. This setting will be enabled if someone needs read-only nodes to prevent insertion and mutation affect reading performance.", 0)
/// If you add a setting which can be updated at runtime, please update 'changeable_settings' map in StorageSystemServerSettings.cpp

View File

@ -5,6 +5,7 @@
#include <Poco/UUID.h>
#include <Poco/Util/Application.h>
#include <Common/AsyncLoader.h>
#include <Common/CgroupsMemoryUsageObserver.h>
#include <Common/PoolId.h>
#include <Common/SensitiveDataMasker.h>
#include <Common/Macros.h>

View File

@ -152,6 +152,7 @@ class ServerType;
template <class Queue>
class MergeTreeBackgroundExecutor;
class AsyncLoader;
struct ICgroupsReader;
struct TemporaryTableHolder;
using TemporaryTablesMapping = std::map<String, std::shared_ptr<TemporaryTableHolder>>;

View File

@ -55,9 +55,11 @@ ServerAsynchronousMetrics::ServerAsynchronousMetrics(
ContextPtr global_context_,
unsigned update_period_seconds,
unsigned heavy_metrics_update_period_seconds,
const ProtocolServerMetricsFunc & protocol_server_metrics_func_)
const ProtocolServerMetricsFunc & protocol_server_metrics_func_,
bool update_jemalloc_epoch_,
bool update_rss_)
: WithContext(global_context_)
, AsynchronousMetrics(update_period_seconds, protocol_server_metrics_func_)
, AsynchronousMetrics(update_period_seconds, protocol_server_metrics_func_, update_jemalloc_epoch_, update_rss_)
, heavy_metric_update_period(heavy_metrics_update_period_seconds)
{
/// sanity check

View File

@ -14,7 +14,10 @@ public:
ContextPtr global_context_,
unsigned update_period_seconds,
unsigned heavy_metrics_update_period_seconds,
const ProtocolServerMetricsFunc & protocol_server_metrics_func_);
const ProtocolServerMetricsFunc & protocol_server_metrics_func_,
bool update_jemalloc_epoch_,
bool update_rss_);
~ServerAsynchronousMetrics() override;
private:

View File

@ -63,7 +63,6 @@ void StorageSystemServerSettings::fillData(MutableColumns & res_columns, Context
/// current setting values, one needs to ask the components directly.
std::unordered_map<String, std::pair<String, ChangeableWithoutRestart>> changeable_settings = {
{"max_server_memory_usage", {std::to_string(total_memory_tracker.getHardLimit()), ChangeableWithoutRestart::Yes}},
{"allow_use_jemalloc_memory", {std::to_string(total_memory_tracker.getAllowUseJemallocMmemory()), ChangeableWithoutRestart::Yes}},
{"max_table_size_to_drop", {std::to_string(context->getMaxTableSizeToDrop()), ChangeableWithoutRestart::Yes}},
{"max_partition_size_to_drop", {std::to_string(context->getMaxPartitionSizeToDrop()), ChangeableWithoutRestart::Yes}},

View File

@ -16,7 +16,7 @@
<value>az-zoo2</value>
<enable_auto_detection_on_cloud>1</enable_auto_detection_on_cloud>
</availability_zone>
<max_memory_usage_soft_limit>20000000</max_memory_usage_soft_limit>
<max_memory_usage_soft_limit>200000000</max_memory_usage_soft_limit>
<coordination_settings>
<operation_timeout_ms>10000</operation_timeout_ms>

View File

@ -13,7 +13,7 @@
<tcp_port>2181</tcp_port>
<server_id>3</server_id>
<max_memory_usage_soft_limit>20000000</max_memory_usage_soft_limit>
<max_memory_usage_soft_limit>200000000</max_memory_usage_soft_limit>
<coordination_settings>
<operation_timeout_ms>10000</operation_timeout_ms>

View File

@ -13,7 +13,6 @@ node = cluster.add_instance(
"configs/async_metrics_no.xml",
],
mem_limit="4g",
env_variables={"MALLOC_CONF": "dirty_decay_ms:0"},
)