ClickHouse/src/Common/ThreadProfileEvents.cpp

417 lines
15 KiB
C++
Raw Normal View History

#include "ThreadProfileEvents.h"
#if defined(__linux__)
#include "TaskStatsInfoGetter.h"
#include "ProcfsMetricsProvider.h"
#include "hasLinuxCapability.h"
#include <optional>
#include <unistd.h>
#include <linux/perf_event.h>
#include <syscall.h>
#include <sys/ioctl.h>
2020-02-19 16:35:01 +00:00
#include <cerrno>
2020-05-19 18:22:30 +00:00
#include <sys/types.h>
#include <dirent.h>
2020-04-13 21:10:31 +00:00
namespace DB
{
bool TasksStatsCounters::checkIfAvailable()
{
return findBestAvailableProvider() != MetricsProvider::None;
}
std::unique_ptr<TasksStatsCounters> TasksStatsCounters::create(const UInt64 tid)
{
std::unique_ptr<TasksStatsCounters> instance;
if (checkIfAvailable())
instance.reset(new TasksStatsCounters(tid, findBestAvailableProvider()));
return instance;
}
TasksStatsCounters::MetricsProvider TasksStatsCounters::findBestAvailableProvider()
{
/// This initialization is thread-safe and executed once since C++11
static std::optional<MetricsProvider> provider =
[]() -> MetricsProvider
{
if (TaskStatsInfoGetter::checkPermissions())
{
return MetricsProvider::Netlink;
}
else if (ProcfsMetricsProvider::isAvailable())
{
return MetricsProvider::Procfs;
}
return MetricsProvider::None;
}();
return *provider;
}
TasksStatsCounters::TasksStatsCounters(const UInt64 tid, const MetricsProvider provider)
{
switch (provider)
{
2020-05-18 22:26:35 +00:00
case MetricsProvider::Netlink:
stats_getter = [metrics_provider = std::make_shared<TaskStatsInfoGetter>(), tid]()
2020-05-18 22:27:19 +00:00
{
::taskstats result;
metrics_provider->getStat(result, tid);
return result;
};
2020-05-18 22:26:35 +00:00
break;
case MetricsProvider::Procfs:
stats_getter = [metrics_provider = std::make_shared<ProcfsMetricsProvider>(tid)]()
2020-05-18 22:27:19 +00:00
{
::taskstats result;
metrics_provider->getTaskStats(result);
return result;
};
2020-05-18 22:26:35 +00:00
break;
case MetricsProvider::None:
;
}
}
void TasksStatsCounters::reset()
{
if (stats_getter)
stats = stats_getter();
}
void TasksStatsCounters::updateCounters(ProfileEvents::Counters & profile_events)
{
if (!stats_getter)
return;
const auto new_stats = stats_getter();
incrementProfileEvents(stats, new_stats, profile_events);
stats = new_stats;
}
void TasksStatsCounters::incrementProfileEvents(const ::taskstats & prev, const ::taskstats & curr, ProfileEvents::Counters & profile_events)
{
profile_events.increment(ProfileEvents::OSCPUWaitMicroseconds,
safeDiff(prev.cpu_delay_total, curr.cpu_delay_total) / 1000U);
profile_events.increment(ProfileEvents::OSIOWaitMicroseconds,
safeDiff(prev.blkio_delay_total, curr.blkio_delay_total) / 1000U);
profile_events.increment(ProfileEvents::OSCPUVirtualTimeMicroseconds,
safeDiff(prev.cpu_run_virtual_total, curr.cpu_run_virtual_total) / 1000U);
/// Since TASKSTATS_VERSION = 3 extended accounting and IO accounting is available.
if (curr.version < 3)
return;
profile_events.increment(ProfileEvents::OSReadChars, safeDiff(prev.read_char, curr.read_char));
profile_events.increment(ProfileEvents::OSWriteChars, safeDiff(prev.write_char, curr.write_char));
profile_events.increment(ProfileEvents::OSReadBytes, safeDiff(prev.read_bytes, curr.read_bytes));
profile_events.increment(ProfileEvents::OSWriteBytes, safeDiff(prev.write_bytes, curr.write_bytes));
}
static PerfEventInfo softwareEvent(int event_config, ProfileEvents::Event profile_event)
{
return PerfEventInfo
{
.event_type = perf_type_id::PERF_TYPE_SOFTWARE,
.event_config = event_config,
.profile_event = profile_event,
.profile_event_running = std::nullopt,
.profile_event_enabled = std::nullopt
};
}
2020-05-18 11:44:24 +00:00
#define HARDWARE_EVENT(PERF_NAME, LOCAL_NAME) \
PerfEventInfo \
{ \
.event_type = perf_type_id::PERF_TYPE_HARDWARE, \
2020-05-19 14:01:51 +00:00
.event_config = (PERF_NAME), \
2020-05-18 11:44:24 +00:00
.profile_event = ProfileEvents::LOCAL_NAME, \
.profile_event_running = {ProfileEvents::LOCAL_NAME##Running}, \
.profile_event_enabled = {ProfileEvents::LOCAL_NAME##Enabled} \
}
// descriptions' source: http://man7.org/linux/man-pages/man2/perf_event_open.2.html
const PerfEventInfo PerfEventsCounters::raw_events_info[] = {
2020-05-18 11:44:24 +00:00
HARDWARE_EVENT(PERF_COUNT_HW_CPU_CYCLES, PerfCpuCycles),
HARDWARE_EVENT(PERF_COUNT_HW_INSTRUCTIONS, PerfInstructions),
HARDWARE_EVENT(PERF_COUNT_HW_CACHE_REFERENCES, PerfCacheReferences),
HARDWARE_EVENT(PERF_COUNT_HW_CACHE_MISSES, PerfCacheMisses),
HARDWARE_EVENT(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, PerfBranchInstructions),
HARDWARE_EVENT(PERF_COUNT_HW_BRANCH_MISSES, PerfBranchMisses),
HARDWARE_EVENT(PERF_COUNT_HW_BUS_CYCLES, PerfBusCycles),
HARDWARE_EVENT(PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, PerfStalledCyclesFrontend),
HARDWARE_EVENT(PERF_COUNT_HW_STALLED_CYCLES_BACKEND, PerfStalledCyclesBackend),
HARDWARE_EVENT(PERF_COUNT_HW_REF_CPU_CYCLES, PerfRefCpuCycles),
// This reports the CPU clock, a high-resolution per-CPU timer.
// a bit broken according to this: https://stackoverflow.com/a/56967896
2020-05-18 11:44:24 +00:00
// softwareEvent(PERF_COUNT_SW_CPU_CLOCK, ProfileEvents::PerfCpuClock),
softwareEvent(PERF_COUNT_SW_TASK_CLOCK, ProfileEvents::PerfTaskClock),
softwareEvent(PERF_COUNT_SW_PAGE_FAULTS, ProfileEvents::PerfPageFaults),
softwareEvent(PERF_COUNT_SW_CONTEXT_SWITCHES, ProfileEvents::PerfContextSwitches),
softwareEvent(PERF_COUNT_SW_CPU_MIGRATIONS, ProfileEvents::PerfCpuMigrations),
softwareEvent(PERF_COUNT_SW_PAGE_FAULTS_MIN, ProfileEvents::PerfPageFaultsMin),
softwareEvent(PERF_COUNT_SW_PAGE_FAULTS_MAJ, ProfileEvents::PerfPageFaultsMaj),
softwareEvent(PERF_COUNT_SW_ALIGNMENT_FAULTS, ProfileEvents::PerfAlignmentFaults),
softwareEvent(PERF_COUNT_SW_EMULATION_FAULTS, ProfileEvents::PerfEmulationFaults)
// This is a placeholder event that counts nothing. Informational sample record types such as mmap or
// comm must be associated with an active event. This dummy event allows gathering such records
// without requiring a counting event.
// softwareEventInfo(PERF_COUNT_SW_DUMMY, ProfileEvents::PERF_COUNT_SW_DUMMY)
};
2020-05-18 11:44:24 +00:00
#undef HARDWARE_EVENT
std::atomic<PerfEventsCounters::Id> PerfEventsCounters::counters_id = 0;
std::atomic<bool> PerfEventsCounters::perf_unavailability_logged = false;
std::atomic<bool> PerfEventsCounters::particular_events_unavailability_logged = false;
2020-05-18 22:15:04 +00:00
thread_local PerfDescriptorsHolder PerfEventsCounters::thread_events_descriptors_holder{};
thread_local bool PerfEventsCounters::thread_events_descriptors_opened = false;
thread_local std::optional<PerfEventsCounters::Id> PerfEventsCounters::current_thread_counters_id = std::nullopt;
Logger * PerfEventsCounters::getLogger()
{
return &Logger::get("PerfEventsCounters");
}
static int openPerfEvent(perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, UInt64 flags)
{
return static_cast<int>(syscall(SYS_perf_event_open, hw_event, pid, cpu, group_fd, flags));
}
2020-05-18 22:17:19 +00:00
// cat /proc/sys/kernel/perf_event_paranoid
// -1: Allow use of (almost) all events by all users
// >=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
// >=1: Disallow CPU event access by users without CAP_SYS_ADMIN
// >=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
// >=3: Disallow all event access by users without CAP_SYS_ADMIN
static bool getPerfEventParanoid(Int32 & result)
{
// the longest possible variant: "-1\0"
constexpr Int32 max_length = 3;
FILE * fp = fopen("/proc/sys/kernel/perf_event_paranoid", "r");
if (fp == nullptr)
return false;
char str[max_length];
char * res = fgets(str, max_length, fp);
fclose(fp);
if (res == nullptr)
return false;
str[max_length - 1] = '\0';
2020-05-19 14:01:51 +00:00
result = static_cast<Int32>(strtol(str, nullptr, 10));
return true;
}
static void perfEventOpenDisabled(Int32 perf_event_paranoid, bool has_cap_sys_admin, int perf_event_type, int perf_event_config, int & event_file_descriptor)
{
perf_event_attr pe = perf_event_attr();
pe.type = perf_event_type;
pe.size = sizeof(struct perf_event_attr);
pe.config = perf_event_config;
// disable by default to add as little extra time as possible
pe.disabled = 1;
// can record kernel only when `perf_event_paranoid` <= 1 or have CAP_SYS_ADMIN
pe.exclude_kernel = perf_event_paranoid >= 2 && !has_cap_sys_admin;
pe.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
event_file_descriptor = openPerfEvent(&pe, /* measure the calling thread */ 0, /* on any cpu */ -1, -1, 0);
}
bool PerfEventsCounters::initializeThreadLocalEvents(PerfEventsCounters & counters)
{
if (thread_events_descriptors_opened)
return true;
Int32 perf_event_paranoid = 0;
bool is_pref_available = getPerfEventParanoid(perf_event_paranoid);
if (!is_pref_available)
{
bool expected_value = false;
if (perf_unavailability_logged.compare_exchange_strong(expected_value, true))
LOG_INFO(getLogger(), "Perf events are unsupported");
return false;
}
bool has_cap_sys_admin = hasLinuxCapability(CAP_SYS_ADMIN);
if (perf_event_paranoid >= 3 && !has_cap_sys_admin)
2020-04-13 21:10:31 +00:00
{
bool expected_value = false;
if (perf_unavailability_logged.compare_exchange_strong(expected_value, true))
LOG_INFO(getLogger(), "Not enough permissions to record perf events");
return false;
}
2020-05-19 18:22:30 +00:00
rlimit64 limits{};
if (getrlimit64(RLIMIT_NOFILE, &limits))
{
LOG_WARNING(getLogger(), "Unable to get rlimit: errno = " << errno << ", message = " << strerror(errno));
return false;
}
UInt64 maximum_open_descriptors = limits.rlim_cur;
std::string dir_path("/proc/");
dir_path += std::to_string(getpid());
dir_path += "/fd";
DIR * fd_dir = opendir(dir_path.c_str());
if (fd_dir == nullptr)
{
LOG_WARNING(getLogger(), "Unable to get file descriptors used by the current process errno = " << errno
<< ", message = " << strerror(errno));
return false;
}
UInt64 opened_descriptors = 0;
while (readdir(fd_dir) != nullptr)
++opened_descriptors;
closedir(fd_dir);
UInt64 fd_count_afterwards = opened_descriptors + NUMBER_OF_RAW_EVENTS;
UInt64 threshold = static_cast<UInt64>(maximum_open_descriptors * FILE_DESCRIPTORS_THRESHOLD);
if (fd_count_afterwards > threshold)
{
LOG_WARNING(getLogger(), "Can't measure perf events as the result number of file descriptors ("
<< fd_count_afterwards << ") is more than the current threshold (" << threshold << " = "
<< maximum_open_descriptors << " * " << FILE_DESCRIPTORS_THRESHOLD << ")");
return false;
}
bool expected = false;
bool log_unsupported_event = particular_events_unavailability_logged.compare_exchange_strong(expected, true);
for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
2020-04-13 21:10:31 +00:00
{
counters.raw_event_values[i] = {};
const PerfEventInfo & event_info = raw_events_info[i];
int & fd = thread_events_descriptors_holder.descriptors[i];
perfEventOpenDisabled(perf_event_paranoid, has_cap_sys_admin, event_info.event_type, event_info.event_config, fd);
if (fd == -1 && log_unsupported_event)
{
LOG_INFO(getLogger(), "Perf event is unsupported: event_type=" << event_info.event_type
<< ", event_config=" << event_info.event_config);
}
}
thread_events_descriptors_opened = true;
return true;
}
void PerfEventsCounters::initializeProfileEvents(PerfEventsCounters & counters)
{
if (current_thread_counters_id.has_value())
{
if (current_thread_counters_id != counters.id)
LOG_WARNING(getLogger(), "Only one instance of `PerfEventsCounters` can be used on the thread");
return;
}
if (!initializeThreadLocalEvents(counters))
return;
for (PerfEventValue & raw_value : counters.raw_event_values)
raw_value = {};
for (int fd : thread_events_descriptors_holder.descriptors)
{
if (fd != -1)
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
}
current_thread_counters_id = counters.id;
}
void PerfEventsCounters::finalizeProfileEvents(PerfEventsCounters & counters, ProfileEvents::Counters & profile_events)
{
if (current_thread_counters_id != counters.id)
return;
if (!thread_events_descriptors_opened)
return;
2020-02-19 16:35:01 +00:00
// process raw events
2020-02-19 16:35:01 +00:00
// only read counters here to have as little overhead for processing as possible
for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
2020-04-13 21:10:31 +00:00
{
int fd = counters.thread_events_descriptors_holder.descriptors[i];
if (fd == -1)
continue;
constexpr ssize_t bytes_to_read = sizeof(counters.raw_event_values[0]);
if (read(fd, &counters.raw_event_values[i], bytes_to_read) != bytes_to_read)
{
LOG_WARNING(getLogger(), "Can't read event value from file descriptor: " << fd);
counters.raw_event_values[i] = {};
}
}
// actually process counters' values and stop measuring
for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
{
int fd = counters.thread_events_descriptors_holder.descriptors[i];
if (fd == -1)
continue;
2020-02-19 16:35:01 +00:00
const PerfEventInfo & info = raw_events_info[i];
const PerfEventValue & raw_value = counters.raw_event_values[i];
profile_events.increment(info.profile_event, raw_value.value);
if (info.profile_event_running.has_value())
profile_events.increment(info.profile_event_running.value(), raw_value.time_running);
if (info.profile_event_enabled.has_value())
profile_events.increment(info.profile_event_enabled.value(), raw_value.time_enabled);
2020-02-19 16:35:01 +00:00
if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0))
LOG_WARNING(getLogger(), "Can't disable perf event with file descriptor: " << fd);
if (ioctl(fd, PERF_EVENT_IOC_RESET, 0))
LOG_WARNING(getLogger(), "Can't reset perf event with file descriptor: " << fd);
}
current_thread_counters_id.reset();
}
PerfEventsCounters::PerfEventsCounters(): id(counters_id++) {}
PerfDescriptorsHolder::PerfDescriptorsHolder()
{
for (int & descriptor : descriptors)
descriptor = -1;
}
PerfDescriptorsHolder::~PerfDescriptorsHolder()
{
for (int & descriptor : descriptors)
{
if (descriptor == -1)
continue;
if (ioctl(descriptor, PERF_EVENT_IOC_DISABLE, 0))
LOG_WARNING(getLogger(), "Can't disable perf event with file descriptor: " << descriptor);
if (close(descriptor))
LOG_WARNING(getLogger(),"Can't close perf event file descriptor: " << descriptor
<< "; error: " << errno << " - " << strerror(errno));
descriptor = -1;
}
}
2020-04-23 18:46:19 +00:00
Logger * PerfDescriptorsHolder::getLogger()
{
return &Logger::get("PerfDescriptorsHolder");
}
}
2020-04-23 18:46:19 +00:00
#else
namespace DB
{
2020-02-19 16:35:01 +00:00
void PerfEventsCounters::initializeProfileEvents(PerfEventsCounters &) {}
void PerfEventsCounters::finalizeProfileEvents(PerfEventsCounters &, ProfileEvents::Counters &) {}
}
#endif