From ec4e8ca5945694e3bcab98e9e77ae22db489bb15 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 11 Aug 2021 10:03:46 +0300 Subject: [PATCH 1/4] Firstly write current exception and then reopen block devices --- src/Interpreters/AsynchronousMetrics.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 8efe959a623..f041d604516 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -967,6 +967,8 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti } catch (...) { + tryLogCurrentException(__PRETTY_FUNCTION__); + /// Try to reopen block devices in case of error /// (i.e. ENOENT means that some disk had been replaced, and it may apperas with a new name) try @@ -977,7 +979,6 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti { tryLogCurrentException(__PRETTY_FUNCTION__); } - tryLogCurrentException(__PRETTY_FUNCTION__); } if (net_dev) From eed5052a86a3cc4451df6a0cc9a48a30ceefe029 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 11 Aug 2021 09:54:55 +0300 Subject: [PATCH 2/4] Reopen hwmon sensors on error (/sys/class/hwmon) Sensors may be recreated (e.g. on module load/unload, or suspend/resume cycle), so descriptors should be reopened. 
--- src/Interpreters/AsynchronousMetrics.cpp | 135 +++++++++++++---------- src/Interpreters/AsynchronousMetrics.h | 1 + 2 files changed, 79 insertions(+), 57 deletions(-) diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index f041d604516..cab87054902 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -102,6 +102,69 @@ AsynchronousMetrics::AsynchronousMetrics( thermal.emplace_back(std::move(file)); } + for (size_t edac_index = 0;; ++edac_index) + { + String edac_correctable_file = fmt::format("/sys/devices/system/edac/mc/mc{}/ce_count", edac_index); + String edac_uncorrectable_file = fmt::format("/sys/devices/system/edac/mc/mc{}/ue_count", edac_index); + + bool edac_correctable_file_exists = std::filesystem::exists(edac_correctable_file); + bool edac_uncorrectable_file_exists = std::filesystem::exists(edac_uncorrectable_file); + + if (!edac_correctable_file_exists && !edac_uncorrectable_file_exists) + { + if (edac_index == 0) + continue; + else + break; + } + + edac.emplace_back(); + + if (edac_correctable_file_exists) + edac.back().first = openFileIfExists(edac_correctable_file); + if (edac_uncorrectable_file_exists) + edac.back().second = openFileIfExists(edac_uncorrectable_file); + } + + openBlockDevices(); + openSensorsChips(); +#endif +} + +#if defined(OS_LINUX) +void AsynchronousMetrics::openBlockDevices() +{ + LOG_TRACE(log, "Scanning /sys/block"); + + if (!std::filesystem::exists("/sys/block")) + return; + + block_devices_rescan_delay.restart(); + + block_devs.clear(); + + for (const auto & device_dir : std::filesystem::directory_iterator("/sys/block")) + { + String device_name = device_dir.path().filename(); + + /// We are not interested in loopback devices. 
+ if (device_name.starts_with("loop")) + continue; + + std::unique_ptr file = openFileIfExists(device_dir.path() / "stat"); + if (!file) + continue; + + block_devs[device_name] = std::move(file); + } +} + +void AsynchronousMetrics::openSensorsChips() +{ + LOG_TRACE(log, "Scanning /sys/class/hwmon"); + + hwmon_devices.clear(); + for (size_t hwmon_index = 0;; ++hwmon_index) { String hwmon_name_file = fmt::format("/sys/class/hwmon/hwmon{}/name", hwmon_index); @@ -150,61 +213,6 @@ AsynchronousMetrics::AsynchronousMetrics( hwmon_devices[hwmon_name][sensor_name] = std::move(file); } } - - for (size_t edac_index = 0;; ++edac_index) - { - String edac_correctable_file = fmt::format("/sys/devices/system/edac/mc/mc{}/ce_count", edac_index); - String edac_uncorrectable_file = fmt::format("/sys/devices/system/edac/mc/mc{}/ue_count", edac_index); - - bool edac_correctable_file_exists = std::filesystem::exists(edac_correctable_file); - bool edac_uncorrectable_file_exists = std::filesystem::exists(edac_uncorrectable_file); - - if (!edac_correctable_file_exists && !edac_uncorrectable_file_exists) - { - if (edac_index == 0) - continue; - else - break; - } - - edac.emplace_back(); - - if (edac_correctable_file_exists) - edac.back().first = openFileIfExists(edac_correctable_file); - if (edac_uncorrectable_file_exists) - edac.back().second = openFileIfExists(edac_uncorrectable_file); - } - - openBlockDevices(); -#endif -} - -#if defined(OS_LINUX) -void AsynchronousMetrics::openBlockDevices() -{ - LOG_TRACE(log, "Scanning /sys/block"); - - if (!std::filesystem::exists("/sys/block")) - return; - - block_devices_rescan_delay.restart(); - - block_devs.clear(); - - for (const auto & device_dir : std::filesystem::directory_iterator("/sys/block")) - { - String device_name = device_dir.path().filename(); - - /// We are not interested in loopback devices. 
- if (device_name.starts_with("loop")) - continue; - - std::unique_ptr file = openFileIfExists(device_dir.path() / "stat"); - if (!file) - continue; - - block_devs[device_name] = std::move(file); - } } #endif @@ -1084,9 +1092,9 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti } } - for (const auto & [hwmon_name, sensors] : hwmon_devices) + try { - try + for (const auto & [hwmon_name, sensors] : hwmon_devices) { for (const auto & [sensor_name, sensor_file] : sensors) { @@ -1107,6 +1115,19 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti new_values[fmt::format("Temperature_{}_{}", hwmon_name, sensor_name)] = temperature * 0.001; } } + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + + /// Files can be re-created on: + /// - module load/unload + /// - suspend/resume cycle + /// So file descriptors should be reopened. + try + { + openSensorsChips(); + } catch (...) { tryLogCurrentException(__PRETTY_FUNCTION__); diff --git a/src/Interpreters/AsynchronousMetrics.h b/src/Interpreters/AsynchronousMetrics.h index c8677ac3ced..a5d7f2ab98f 100644 --- a/src/Interpreters/AsynchronousMetrics.h +++ b/src/Interpreters/AsynchronousMetrics.h @@ -184,6 +184,7 @@ private: Stopwatch block_devices_rescan_delay; void openBlockDevices(); + void openSensorsChips(); #endif std::unique_ptr thread; From 3f91f61c3c4e8122b38bbe60bcbe6cde2b181501 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 11 Aug 2021 10:02:34 +0300 Subject: [PATCH 3/4] Reopen EDAC files (/sys/devices/system/edac) --- src/Interpreters/AsynchronousMetrics.cpp | 76 +++++++++++++++--------- src/Interpreters/AsynchronousMetrics.h | 1 + 2 files changed, 48 insertions(+), 29 deletions(-) diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index cab87054902..bf0bbe804fe 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -102,32 +102,9 @@ 
AsynchronousMetrics::AsynchronousMetrics( thermal.emplace_back(std::move(file)); } - for (size_t edac_index = 0;; ++edac_index) - { - String edac_correctable_file = fmt::format("/sys/devices/system/edac/mc/mc{}/ce_count", edac_index); - String edac_uncorrectable_file = fmt::format("/sys/devices/system/edac/mc/mc{}/ue_count", edac_index); - - bool edac_correctable_file_exists = std::filesystem::exists(edac_correctable_file); - bool edac_uncorrectable_file_exists = std::filesystem::exists(edac_uncorrectable_file); - - if (!edac_correctable_file_exists && !edac_uncorrectable_file_exists) - { - if (edac_index == 0) - continue; - else - break; - } - - edac.emplace_back(); - - if (edac_correctable_file_exists) - edac.back().first = openFileIfExists(edac_correctable_file); - if (edac_uncorrectable_file_exists) - edac.back().second = openFileIfExists(edac_uncorrectable_file); - } - openBlockDevices(); openSensorsChips(); + openEDAC(); #endif } @@ -159,6 +136,37 @@ void AsynchronousMetrics::openBlockDevices() } } +void AsynchronousMetrics::openEDAC() +{ + LOG_TRACE(log, "Scanning /sys/devices/system/edac"); + + edac.clear(); + + for (size_t edac_index = 0;; ++edac_index) + { + String edac_correctable_file = fmt::format("/sys/devices/system/edac/mc/mc{}/ce_count", edac_index); + String edac_uncorrectable_file = fmt::format("/sys/devices/system/edac/mc/mc{}/ue_count", edac_index); + + bool edac_correctable_file_exists = std::filesystem::exists(edac_correctable_file); + bool edac_uncorrectable_file_exists = std::filesystem::exists(edac_uncorrectable_file); + + if (!edac_correctable_file_exists && !edac_uncorrectable_file_exists) + { + if (edac_index == 0) + continue; + else + break; + } + + edac.emplace_back(); + + if (edac_correctable_file_exists) + edac.back().first = openFileIfExists(edac_correctable_file); + if (edac_uncorrectable_file_exists) + edac.back().second = openFileIfExists(edac_uncorrectable_file); + } +} + void AsynchronousMetrics::openSensorsChips() { 
LOG_TRACE(log, "Scanning /sys/class/hwmon"); @@ -1134,13 +1142,13 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti } } - for (size_t i = 0, size = edac.size(); i < size; ++i) + try { - /// NOTE maybe we need to take difference with previous values. - /// But these metrics should be exceptionally rare, so it's ok to keep them accumulated. - - try + for (size_t i = 0, size = edac.size(); i < size; ++i) { + /// NOTE maybe we need to take difference with previous values. + /// But these metrics should be exceptionally rare, so it's ok to keep them accumulated. + if (edac[i].first) { ReadBufferFromFilePRead & in = *edac[i].first; @@ -1159,6 +1167,16 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti new_values[fmt::format("EDAC{}_Uncorrectable", i)] = errors; } } + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + + /// EDAC files can be re-created on module load/unload + try + { + openEDAC(); + } catch (...) 
{ tryLogCurrentException(__PRETTY_FUNCTION__); diff --git a/src/Interpreters/AsynchronousMetrics.h b/src/Interpreters/AsynchronousMetrics.h index a5d7f2ab98f..409f2dfeec4 100644 --- a/src/Interpreters/AsynchronousMetrics.h +++ b/src/Interpreters/AsynchronousMetrics.h @@ -185,6 +185,7 @@ private: void openBlockDevices(); void openSensorsChips(); + void openEDAC(); #endif std::unique_ptr thread; From 67ebcef978764220a881f9c1d2c9a354ef87bd05 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 11 Aug 2021 10:09:00 +0300 Subject: [PATCH 4/4] Reopen sensors (/sys/class/thermal) --- src/Interpreters/AsynchronousMetrics.cpp | 34 ++++++++++++++++++------ src/Interpreters/AsynchronousMetrics.h | 1 + 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index bf0bbe804fe..fd02aa4abec 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -88,6 +88,20 @@ AsynchronousMetrics::AsynchronousMetrics( openFileIfExists("/proc/uptime", uptime); openFileIfExists("/proc/net/dev", net_dev); + openSensors(); + openBlockDevices(); + openEDAC(); + openSensorsChips(); +#endif +} + +#if defined(OS_LINUX) +void AsynchronousMetrics::openSensors() +{ + LOG_TRACE(log, "Scanning /sys/class/thermal"); + + thermal.clear(); + for (size_t thermal_device_index = 0;; ++thermal_device_index) { std::unique_ptr file = openFileIfExists(fmt::format("/sys/class/thermal/thermal_zone{}/temp", thermal_device_index)); @@ -101,14 +115,8 @@ AsynchronousMetrics::AsynchronousMetrics( } thermal.emplace_back(std::move(file)); } - - openBlockDevices(); - openSensorsChips(); - openEDAC(); -#endif } -#if defined(OS_LINUX) void AsynchronousMetrics::openBlockDevices() { LOG_TRACE(log, "Scanning /sys/block"); @@ -1083,9 +1091,9 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti } } - for (size_t i = 0, size = thermal.size(); i < size; ++i) + try { - 
try + for (size_t i = 0, size = thermal.size(); i < size; ++i) { ReadBufferFromFilePRead & in = *thermal[i]; @@ -1094,6 +1102,16 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti readText(temperature, in); new_values[fmt::format("Temperature{}", i)] = temperature * 0.001; } + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + + /// Files may be re-created on module load/unload + try + { + openSensors(); + } catch (...) { tryLogCurrentException(__PRETTY_FUNCTION__); diff --git a/src/Interpreters/AsynchronousMetrics.h b/src/Interpreters/AsynchronousMetrics.h index 409f2dfeec4..93e77b6bde8 100644 --- a/src/Interpreters/AsynchronousMetrics.h +++ b/src/Interpreters/AsynchronousMetrics.h @@ -183,6 +183,7 @@ private: Stopwatch block_devices_rescan_delay; + void openSensors(); void openBlockDevices(); void openSensorsChips(); void openEDAC();