diff --git a/dbms/src/Common/ProfileEvents.cpp b/dbms/src/Common/ProfileEvents.cpp index cda51739c93..36f9e5c9392 100644 --- a/dbms/src/Common/ProfileEvents.cpp +++ b/dbms/src/Common/ProfileEvents.cpp @@ -178,6 +178,17 @@ M(OSReadChars, "Number of bytes read from filesystem, including page cache.") \ M(OSWriteChars, "Number of bytes written to filesystem, including page cache.") \ \ + M(PERF_COUNT_HW_CPU_CYCLES, "") \ + M(PERF_COUNT_HW_INSTRUCTIONS, "") \ + M(PERF_COUNT_HW_CACHE_REFERENCES, "") \ + M(PERF_COUNT_HW_CACHE_MISSES, "") \ + M(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "") \ + M(PERF_COUNT_HW_BRANCH_MISSES, "") \ + M(PERF_COUNT_HW_BUS_CYCLES, "") \ + M(PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "") \ + M(PERF_COUNT_HW_STALLED_CYCLES_BACKEND, "") \ + M(PERF_COUNT_HW_REF_CPU_CYCLES, "") \ + \ M(PERF_COUNT_SW_TASK_CLOCK, "A clock count specific to the task that is running") \ M(PERF_COUNT_SW_PAGE_FAULTS, "Number of page faults") \ M(PERF_COUNT_SW_CONTEXT_SWITCHES, "Number of context switches") \ @@ -187,6 +198,9 @@ M(PERF_COUNT_SW_ALIGNMENT_FAULTS, "Number of alignment faults. These happen when unaligned memory accesses happen; the kernel can handle these but it reduces performance. This happens only on some architectures (never on x86).") \ M(PERF_COUNT_SW_EMULATION_FAULTS, "Number of emulation faults. The kernel sometimes traps on unimplemented instructions and emulates them for user space. This can negatively impact performance.") \ \ + M(PERF_CUSTOM_INSTRUCTIONS_PER_CPU_CYCLE_SCALED, "") \ + M(PERF_CUSTOM_INSTRUCTIONS_PER_CPU_CYCLE, "") \ + \ M(CreatedHTTPConnections, "Total amount of created HTTP connections (closed or opened).") \ \ M(CannotWriteToWriteBufferDiscard, "Number of stack traces dropped by query profiler or signal handler because pipe is full or cannot write to pipe.") \ diff --git a/dbms/src/Common/ThreadProfileEvents.cpp b/dbms/src/Common/ThreadProfileEvents.cpp index 959bda87b06..67fe1bcc6cd 100644 --- a/dbms/src/Common/ThreadProfileEvents.cpp +++ b/dbms/src/Common/ThreadProfileEvents.cpp @@ -12,56 +12,103 @@ namespace DB { #if defined(__linux__) - // todo: think about event counters' overflow - // todo: ask about the usual error reporting (whether stderr is an accepted way) + static PerfEventInfo softwareEvent(int event_config, ProfileEvents::Event profile_event) + { + return PerfEventInfo + { + .event_type = perf_type_id::PERF_TYPE_SOFTWARE, + .event_config = event_config, + .profile_event = profile_event + }; + } + + static PerfEventInfo hardwareEvent(int event_config, ProfileEvents::Event profile_event) + { + return PerfEventInfo + { + .event_type = perf_type_id::PERF_TYPE_HARDWARE, + .event_config = event_config, + .profile_event = profile_event + }; + } // descriptions' source: http://man7.org/linux/man-pages/man2/perf_event_open.2.html - const int PerfEventsCounters::perf_event_configs[] = { + const PerfEventInfo PerfEventsCounters::perf_raw_events_info[] = { + + // Total cycles. Be wary of what happens during CPU frequency scaling. + hardwareEvent(perf_hw_id::PERF_COUNT_HW_CPU_CYCLES, ProfileEvents::PERF_COUNT_HW_CPU_CYCLES), + // Retired instructions. Be careful, these can be affected by various issues, most notably hardware + // interrupt counts. + hardwareEvent(perf_hw_id::PERF_COUNT_HW_INSTRUCTIONS, ProfileEvents::PERF_COUNT_HW_INSTRUCTIONS), + // Cache accesses. Usually this indicates Last Level Cache accesses but this may vary depending on your CPU. + // This may include prefetches and coherency messages; again this depends on the design of your CPU. + hardwareEvent(perf_hw_id::PERF_COUNT_HW_CACHE_REFERENCES, ProfileEvents::PERF_COUNT_HW_CACHE_REFERENCES), + // Cache misses. Usually this indicates Last Level Cache misses; this is intended to be used in con‐junction + // with the PERF_COUNT_HW_CACHE_REFERENCES event to calculate cache miss rates. + hardwareEvent(perf_hw_id::PERF_COUNT_HW_CACHE_MISSES, ProfileEvents::PERF_COUNT_HW_CACHE_MISSES), + // Retired branch instructions. Prior to Linux 2.6.35, this used the wrong event on AMD processors. + hardwareEvent(perf_hw_id::PERF_COUNT_HW_BRANCH_INSTRUCTIONS, ProfileEvents::PERF_COUNT_HW_BRANCH_INSTRUCTIONS), + // Mispredicted branch instructions. + hardwareEvent(perf_hw_id::PERF_COUNT_HW_BRANCH_MISSES, ProfileEvents::PERF_COUNT_HW_BRANCH_MISSES), + // Bus cycles, which can be different from total cycles. + hardwareEvent(perf_hw_id::PERF_COUNT_HW_BUS_CYCLES, ProfileEvents::PERF_COUNT_HW_BUS_CYCLES), + // Stalled cycles during issue. + hardwareEvent(perf_hw_id::PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, ProfileEvents::PERF_COUNT_HW_STALLED_CYCLES_FRONTEND), + // Stalled cycles during retirement. + hardwareEvent(perf_hw_id::PERF_COUNT_HW_STALLED_CYCLES_BACKEND, ProfileEvents::PERF_COUNT_HW_STALLED_CYCLES_BACKEND), + // Total cycles; not affected by CPU frequency scaling. + hardwareEvent(perf_hw_id::PERF_COUNT_HW_REF_CPU_CYCLES, ProfileEvents::PERF_COUNT_HW_REF_CPU_CYCLES), + // This reports the CPU clock, a high-resolution per-CPU timer. -// perf_sw_ids::PERF_COUNT_SW_CPU_CLOCK, + // a bit broken according to this: https://stackoverflow.com/a/56967896 +// makeInfo(perf_type_id::PERF_TYPE_SOFTWARE, perf_sw_ids::PERF_COUNT_SW_CPU_CLOCK, ProfileEvents::PERF_COUNT_SW_CPU_CLOCK), // This reports a clock count specific to the task that is running. - perf_sw_ids::PERF_COUNT_SW_TASK_CLOCK, + softwareEvent(perf_sw_ids::PERF_COUNT_SW_TASK_CLOCK, ProfileEvents::PERF_COUNT_SW_TASK_CLOCK), // This reports the number of page faults. - perf_sw_ids::PERF_COUNT_SW_PAGE_FAULTS, + softwareEvent(perf_sw_ids::PERF_COUNT_SW_PAGE_FAULTS, ProfileEvents::PERF_COUNT_SW_PAGE_FAULTS), // This counts context switches. // Until Linux 2.6.34, these were all reported as user-space events, // after that they are reported as happening in the kernel - perf_sw_ids::PERF_COUNT_SW_CONTEXT_SWITCHES, + softwareEvent(perf_sw_ids::PERF_COUNT_SW_CONTEXT_SWITCHES, ProfileEvents::PERF_COUNT_SW_CONTEXT_SWITCHES), // This reports the number of times the process has migrated to a new CPU. - perf_sw_ids::PERF_COUNT_SW_CPU_MIGRATIONS, + softwareEvent(perf_sw_ids::PERF_COUNT_SW_CPU_MIGRATIONS, ProfileEvents::PERF_COUNT_SW_CPU_MIGRATIONS), // This counts the number of minor page faults. These did not require disk I/O to handle. - perf_sw_ids::PERF_COUNT_SW_PAGE_FAULTS_MIN, + softwareEvent(perf_sw_ids::PERF_COUNT_SW_PAGE_FAULTS_MIN, ProfileEvents::PERF_COUNT_SW_PAGE_FAULTS_MIN), // This counts the number of major page faults. These required disk I/O to handle. - perf_sw_ids::PERF_COUNT_SW_PAGE_FAULTS_MAJ, + softwareEvent(perf_sw_ids::PERF_COUNT_SW_PAGE_FAULTS_MAJ, ProfileEvents::PERF_COUNT_SW_PAGE_FAULTS_MAJ), // This counts the number of alignment faults. These happen when unaligned memory accesses happen; // the kernel can handle these but it reduces performance. // This happens only on some architectures (never on x86). - perf_sw_ids::PERF_COUNT_SW_ALIGNMENT_FAULTS, + softwareEvent(perf_sw_ids::PERF_COUNT_SW_ALIGNMENT_FAULTS, ProfileEvents::PERF_COUNT_SW_ALIGNMENT_FAULTS), // This counts the number of emulation faults. The kernel sometimes traps on unimplemented instructions and // emulates them for user space. This can negatively impact performance. - perf_sw_ids::PERF_COUNT_SW_EMULATION_FAULTS + softwareEvent(perf_sw_ids::PERF_COUNT_SW_EMULATION_FAULTS, ProfileEvents::PERF_COUNT_SW_EMULATION_FAULTS) // This is a placeholder event that counts nothing. Informational sample record types such as mmap or // comm must be associated with an active event. This dummy event allows gathering such records // without requiring a counting event. -// perf_sw_ids::PERF_COUNT_SW_DUMMY +// softwareEventInfo(perf_sw_ids::PERF_COUNT_SW_DUMMY, ProfileEvents::PERF_COUNT_SW_DUMMY) }; - const ProfileEvents::Event PerfEventsCounters::perf_events[] = { - // a bit broken according to this: https://stackoverflow.com/a/56967896 -// ProfileEvents::PERF_COUNT_SW_CPU_CLOCK, - ProfileEvents::PERF_COUNT_SW_TASK_CLOCK, - ProfileEvents::PERF_COUNT_SW_PAGE_FAULTS, - ProfileEvents::PERF_COUNT_SW_CONTEXT_SWITCHES, - ProfileEvents::PERF_COUNT_SW_CPU_MIGRATIONS, - ProfileEvents::PERF_COUNT_SW_PAGE_FAULTS_MIN, - ProfileEvents::PERF_COUNT_SW_PAGE_FAULTS_MAJ, - ProfileEvents::PERF_COUNT_SW_ALIGNMENT_FAULTS, - ProfileEvents::PERF_COUNT_SW_EMULATION_FAULTS -// ProfileEvents::PERF_COUNT_SW_DUMMY, - }; + static_assert(std::size(PerfEventsCounters::perf_raw_events_info) == PerfEventsCounters::NUMBER_OF_RAW_EVENTS); - static_assert(std::size(PerfEventsCounters::perf_event_configs) == PerfEventsCounters::NUMBER_OF_EVENTS); - static_assert(std::size(PerfEventsCounters::perf_events) == PerfEventsCounters::NUMBER_OF_EVENTS); + const std::map PerfEventsCounters::event_config_to_info_index = [] { + std::map map; + for (size_t i = 0; i < PerfEventsCounters::NUMBER_OF_RAW_EVENTS; ++i) + map.emplace(PerfEventsCounters::perf_raw_events_info[i].event_config, i); + return map; + } (); + + std::atomic PerfEventsCounters::events_availability_logged = false; + + Logger * PerfEventsCounters::getLogger() + { + return &Logger::get("PerfEventsCounters"); + } + + long long PerfEventsCounters::getRawValue(int event_config) + { + return raw_event_values[event_config_to_info_index.find(event_config)->second]; + } static int openPerfEvent(perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { return static_cast(syscall(SYS_perf_event_open, hw_event, pid, cpu, group_fd, flags)); @@ -89,7 +136,7 @@ namespace DB { return true; } - static void perfEventStart(int perf_event_paranoid, int perf_event_type, int perf_event_config, int &event_file_descriptor) { + static void perfEventOpenDisabled(int perf_event_paranoid, int perf_event_type, int perf_event_config, int &event_file_descriptor) { perf_event_attr pe = perf_event_attr(); pe.type = perf_event_type; pe.size = sizeof(struct perf_event_attr); @@ -113,20 +160,28 @@ namespace DB { if (!is_pref_available) return; - for (size_t i = 0; i < NUMBER_OF_EVENTS; ++i) + bool expected = false; + bool log_unsupported_event = events_availability_logged.compare_exchange_strong(expected, true); + for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i) { - perfEventStart(perf_event_paranoid, perf_type_id::PERF_TYPE_SOFTWARE, perf_event_configs[i], counters.events_descriptors[i]); + counters.raw_event_values[i] = 0; + const PerfEventInfo & event_info = perf_raw_events_info[i]; + int & fd = counters.events_descriptors[i]; + perfEventOpenDisabled(perf_event_paranoid, event_info.event_type, event_info.event_config, fd); + + if (fd == -1 && log_unsupported_event) + { + LOG_WARNING( + getLogger(), + "Perf event is unsupported: event_type=" << event_info.event_type + << ", event_config=" << event_info.event_config); + } } - for (size_t i = 0; i < NUMBER_OF_EVENTS; ++i) + for (int fd : counters.events_descriptors) { - int fd = counters.events_descriptors[i]; - if (fd == -1) { - fprintf(stderr, "Event config %d is unsupported\n", perf_event_configs[i]); - continue; - } - - ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); + if (fd != -1) + ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); } counters.perf_events_recording = true; @@ -136,26 +191,48 @@ namespace DB { if (!counters.perf_events_recording) return; - for (size_t i = 0; i < NUMBER_OF_EVENTS; ++i) { + // process raw events + + // only read counters here to have as little overhead for processing as possible + for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i) + { + int fd = counters.events_descriptors[i]; + if (fd != -1) + read(fd, &counters.raw_event_values[i], sizeof(long long)); + } + + // actually process counters' values and release resources + for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i) + { int & fd = counters.events_descriptors[i]; if (fd == -1) continue; - long long count; - read(fd, &count, sizeof(count)); - - profile_events.increment(perf_events[i], static_cast(count)); -// printf("%s: %lld\n", perf_event_names[i].c_str(), count); + profile_events.increment(perf_raw_events_info[i].profile_event, counters.raw_event_values[i]); if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0)) - fprintf(stderr, "Can't disable perf event with file descriptor: %d\n", fd); + LOG_WARNING(getLogger(), "Can't disable perf event with file descriptor: " << fd); if (close(fd)) - fprintf(stderr, "Can't close perf event file descriptor: %d; error: %d - %s\n", fd, errno, strerror(errno)); + LOG_WARNING(getLogger(), "Can't close perf event file descriptor: " << fd << "; error: " << errno << " - " << strerror(errno)); fd = -1; } + // process custom events which depend on the raw ones + long long hw_cpu_cycles = counters.getRawValue(perf_hw_id::PERF_COUNT_HW_CPU_CYCLES); + long long hw_ref_cpu_cycles = counters.getRawValue(perf_hw_id::PERF_COUNT_HW_REF_CPU_CYCLES); + + long long instructions_per_cpu_scaled = hw_cpu_cycles != 0 + ? counters.getRawValue(perf_hw_id::PERF_COUNT_HW_INSTRUCTIONS) / hw_cpu_cycles + : 0; + long long instructions_per_cpu = hw_ref_cpu_cycles != 0 + ? counters.getRawValue(perf_hw_id::PERF_COUNT_HW_INSTRUCTIONS) / hw_ref_cpu_cycles + : 0; + + profile_events.increment(ProfileEvents::PERF_CUSTOM_INSTRUCTIONS_PER_CPU_CYCLE_SCALED, instructions_per_cpu_scaled); + profile_events.increment(ProfileEvents::PERF_CUSTOM_INSTRUCTIONS_PER_CPU_CYCLE, instructions_per_cpu); + counters.perf_events_recording = false; } diff --git a/dbms/src/Common/ThreadProfileEvents.h b/dbms/src/Common/ThreadProfileEvents.h index 767df93c43e..2ca262bfd60 100644 --- a/dbms/src/Common/ThreadProfileEvents.h +++ b/dbms/src/Common/ThreadProfileEvents.h @@ -5,6 +5,8 @@ #include #include #include +#include +#include #if defined(__linux__) #include @@ -35,6 +37,17 @@ namespace ProfileEvents extern const Event OSReadBytes; extern const Event OSWriteBytes; + extern const Event PERF_COUNT_HW_CPU_CYCLES; + extern const Event PERF_COUNT_HW_INSTRUCTIONS; + extern const Event PERF_COUNT_HW_CACHE_REFERENCES; + extern const Event PERF_COUNT_HW_CACHE_MISSES; + extern const Event PERF_COUNT_HW_BRANCH_INSTRUCTIONS; + extern const Event PERF_COUNT_HW_BRANCH_MISSES; + extern const Event PERF_COUNT_HW_BUS_CYCLES; + extern const Event PERF_COUNT_HW_STALLED_CYCLES_FRONTEND; + extern const Event PERF_COUNT_HW_STALLED_CYCLES_BACKEND; + extern const Event PERF_COUNT_HW_REF_CPU_CYCLES; + // extern const Event PERF_COUNT_SW_CPU_CLOCK; extern const Event PERF_COUNT_SW_TASK_CLOCK; extern const Event PERF_COUNT_SW_PAGE_FAULTS; @@ -44,6 +57,9 @@ namespace ProfileEvents extern const Event PERF_COUNT_SW_PAGE_FAULTS_MAJ; extern const Event PERF_COUNT_SW_ALIGNMENT_FAULTS; extern const Event PERF_COUNT_SW_EMULATION_FAULTS; + + extern const Event PERF_CUSTOM_INSTRUCTIONS_PER_CPU_CYCLE_SCALED; + extern const Event PERF_CUSTOM_INSTRUCTIONS_PER_CPU_CYCLE; #endif } @@ -127,6 +143,18 @@ struct RUsageCounters } }; +#if defined(__linux__) + + struct PerfEventInfo { + // see perf_event.h/perf_type_id enum + int event_type; + // see configs in perf_event.h + int event_config; + ProfileEvents::Event profile_event; + }; + +#endif + struct PerfEventsCounters { // cat /proc/sys/kernel/perf_event_paranoid - if perf_event_paranoid is set to 3, all calls to `perf_event_open` are rejected (even for the current process) @@ -143,18 +171,28 @@ struct PerfEventsCounters // todo: check whether perf_event_open() is available with CAP_SYS_ADMIN #if defined(__linux__) - static constexpr size_t NUMBER_OF_EVENTS = 8; + static constexpr size_t NUMBER_OF_RAW_EVENTS = 18; - static const int perf_event_configs[]; - static const ProfileEvents::Event perf_events[]; + static const PerfEventInfo perf_raw_events_info[]; + static const std::map event_config_to_info_index; - int events_descriptors[NUMBER_OF_EVENTS]; + int events_descriptors[NUMBER_OF_RAW_EVENTS]; + // temp array just to not create it each time event processing finishes + long long raw_event_values[NUMBER_OF_RAW_EVENTS]; bool perf_events_recording = false; #endif static void initializeProfileEvents(PerfEventsCounters & counters); static void finalizeProfileEvents(PerfEventsCounters & counters, ProfileEvents::Counters & profile_events); + +private: + // used to write information about perf event availability only once for all threads + static std::atomic events_availability_logged; + + static Logger * getLogger(); + + long long getRawValue(int event_config); }; #if defined(__linux__)