mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 09:32:06 +00:00
Merge pull request #27554 from azat/async-metrics-reopen
Fix polling of /sys/class on errors
This commit is contained in:
commit
a806fa6495
@ -88,6 +88,20 @@ AsynchronousMetrics::AsynchronousMetrics(
|
|||||||
openFileIfExists("/proc/uptime", uptime);
|
openFileIfExists("/proc/uptime", uptime);
|
||||||
openFileIfExists("/proc/net/dev", net_dev);
|
openFileIfExists("/proc/net/dev", net_dev);
|
||||||
|
|
||||||
|
openSensors();
|
||||||
|
openBlockDevices();
|
||||||
|
openEDAC();
|
||||||
|
openSensorsChips();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(OS_LINUX)
|
||||||
|
void AsynchronousMetrics::openSensors()
|
||||||
|
{
|
||||||
|
LOG_TRACE(log, "Scanning /sys/class/thermal");
|
||||||
|
|
||||||
|
thermal.clear();
|
||||||
|
|
||||||
for (size_t thermal_device_index = 0;; ++thermal_device_index)
|
for (size_t thermal_device_index = 0;; ++thermal_device_index)
|
||||||
{
|
{
|
||||||
std::unique_ptr<ReadBufferFromFilePRead> file = openFileIfExists(fmt::format("/sys/class/thermal/thermal_zone{}/temp", thermal_device_index));
|
std::unique_ptr<ReadBufferFromFilePRead> file = openFileIfExists(fmt::format("/sys/class/thermal/thermal_zone{}/temp", thermal_device_index));
|
||||||
@ -101,6 +115,71 @@ AsynchronousMetrics::AsynchronousMetrics(
|
|||||||
}
|
}
|
||||||
thermal.emplace_back(std::move(file));
|
thermal.emplace_back(std::move(file));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void AsynchronousMetrics::openBlockDevices()
|
||||||
|
{
|
||||||
|
LOG_TRACE(log, "Scanning /sys/block");
|
||||||
|
|
||||||
|
if (!std::filesystem::exists("/sys/block"))
|
||||||
|
return;
|
||||||
|
|
||||||
|
block_devices_rescan_delay.restart();
|
||||||
|
|
||||||
|
block_devs.clear();
|
||||||
|
|
||||||
|
for (const auto & device_dir : std::filesystem::directory_iterator("/sys/block"))
|
||||||
|
{
|
||||||
|
String device_name = device_dir.path().filename();
|
||||||
|
|
||||||
|
/// We are not interested in loopback devices.
|
||||||
|
if (device_name.starts_with("loop"))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
std::unique_ptr<ReadBufferFromFilePRead> file = openFileIfExists(device_dir.path() / "stat");
|
||||||
|
if (!file)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
block_devs[device_name] = std::move(file);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void AsynchronousMetrics::openEDAC()
|
||||||
|
{
|
||||||
|
LOG_TRACE(log, "Scanning /sys/devices/system/edac");
|
||||||
|
|
||||||
|
edac.clear();
|
||||||
|
|
||||||
|
for (size_t edac_index = 0;; ++edac_index)
|
||||||
|
{
|
||||||
|
String edac_correctable_file = fmt::format("/sys/devices/system/edac/mc/mc{}/ce_count", edac_index);
|
||||||
|
String edac_uncorrectable_file = fmt::format("/sys/devices/system/edac/mc/mc{}/ue_count", edac_index);
|
||||||
|
|
||||||
|
bool edac_correctable_file_exists = std::filesystem::exists(edac_correctable_file);
|
||||||
|
bool edac_uncorrectable_file_exists = std::filesystem::exists(edac_uncorrectable_file);
|
||||||
|
|
||||||
|
if (!edac_correctable_file_exists && !edac_uncorrectable_file_exists)
|
||||||
|
{
|
||||||
|
if (edac_index == 0)
|
||||||
|
continue;
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
edac.emplace_back();
|
||||||
|
|
||||||
|
if (edac_correctable_file_exists)
|
||||||
|
edac.back().first = openFileIfExists(edac_correctable_file);
|
||||||
|
if (edac_uncorrectable_file_exists)
|
||||||
|
edac.back().second = openFileIfExists(edac_uncorrectable_file);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void AsynchronousMetrics::openSensorsChips()
|
||||||
|
{
|
||||||
|
LOG_TRACE(log, "Scanning /sys/class/hwmon");
|
||||||
|
|
||||||
|
hwmon_devices.clear();
|
||||||
|
|
||||||
for (size_t hwmon_index = 0;; ++hwmon_index)
|
for (size_t hwmon_index = 0;; ++hwmon_index)
|
||||||
{
|
{
|
||||||
@ -150,61 +229,6 @@ AsynchronousMetrics::AsynchronousMetrics(
|
|||||||
hwmon_devices[hwmon_name][sensor_name] = std::move(file);
|
hwmon_devices[hwmon_name][sensor_name] = std::move(file);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t edac_index = 0;; ++edac_index)
|
|
||||||
{
|
|
||||||
String edac_correctable_file = fmt::format("/sys/devices/system/edac/mc/mc{}/ce_count", edac_index);
|
|
||||||
String edac_uncorrectable_file = fmt::format("/sys/devices/system/edac/mc/mc{}/ue_count", edac_index);
|
|
||||||
|
|
||||||
bool edac_correctable_file_exists = std::filesystem::exists(edac_correctable_file);
|
|
||||||
bool edac_uncorrectable_file_exists = std::filesystem::exists(edac_uncorrectable_file);
|
|
||||||
|
|
||||||
if (!edac_correctable_file_exists && !edac_uncorrectable_file_exists)
|
|
||||||
{
|
|
||||||
if (edac_index == 0)
|
|
||||||
continue;
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
edac.emplace_back();
|
|
||||||
|
|
||||||
if (edac_correctable_file_exists)
|
|
||||||
edac.back().first = openFileIfExists(edac_correctable_file);
|
|
||||||
if (edac_uncorrectable_file_exists)
|
|
||||||
edac.back().second = openFileIfExists(edac_uncorrectable_file);
|
|
||||||
}
|
|
||||||
|
|
||||||
openBlockDevices();
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(OS_LINUX)
|
|
||||||
void AsynchronousMetrics::openBlockDevices()
|
|
||||||
{
|
|
||||||
LOG_TRACE(log, "Scanning /sys/block");
|
|
||||||
|
|
||||||
if (!std::filesystem::exists("/sys/block"))
|
|
||||||
return;
|
|
||||||
|
|
||||||
block_devices_rescan_delay.restart();
|
|
||||||
|
|
||||||
block_devs.clear();
|
|
||||||
|
|
||||||
for (const auto & device_dir : std::filesystem::directory_iterator("/sys/block"))
|
|
||||||
{
|
|
||||||
String device_name = device_dir.path().filename();
|
|
||||||
|
|
||||||
/// We are not interested in loopback devices.
|
|
||||||
if (device_name.starts_with("loop"))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
std::unique_ptr<ReadBufferFromFilePRead> file = openFileIfExists(device_dir.path() / "stat");
|
|
||||||
if (!file)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
block_devs[device_name] = std::move(file);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -967,6 +991,8 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti
|
|||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
{
|
{
|
||||||
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
||||||
|
|
||||||
/// Try to reopen block devices in case of error
|
/// Try to reopen block devices in case of error
|
||||||
/// (i.e. ENOENT means that some disk had been replaced, and it may appear with a new name)
|
/// (i.e. ENOENT means that some disk had been replaced, and it may appear with a new name)
|
||||||
try
|
try
|
||||||
@ -977,7 +1003,6 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti
|
|||||||
{
|
{
|
||||||
tryLogCurrentException(__PRETTY_FUNCTION__);
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
||||||
}
|
}
|
||||||
tryLogCurrentException(__PRETTY_FUNCTION__);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (net_dev)
|
if (net_dev)
|
||||||
@ -1066,9 +1091,9 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0, size = thermal.size(); i < size; ++i)
|
|
||||||
{
|
|
||||||
try
|
try
|
||||||
|
{
|
||||||
|
for (size_t i = 0, size = thermal.size(); i < size; ++i)
|
||||||
{
|
{
|
||||||
ReadBufferFromFilePRead & in = *thermal[i];
|
ReadBufferFromFilePRead & in = *thermal[i];
|
||||||
|
|
||||||
@ -1077,15 +1102,25 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti
|
|||||||
readText(temperature, in);
|
readText(temperature, in);
|
||||||
new_values[fmt::format("Temperature{}", i)] = temperature * 0.001;
|
new_values[fmt::format("Temperature{}", i)] = temperature * 0.001;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
catch (...)
|
||||||
|
{
|
||||||
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
||||||
|
|
||||||
|
/// Files may be re-created on module load/unload
|
||||||
|
try
|
||||||
|
{
|
||||||
|
openSensors();
|
||||||
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
{
|
{
|
||||||
tryLogCurrentException(__PRETTY_FUNCTION__);
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & [hwmon_name, sensors] : hwmon_devices)
|
|
||||||
{
|
|
||||||
try
|
try
|
||||||
|
{
|
||||||
|
for (const auto & [hwmon_name, sensors] : hwmon_devices)
|
||||||
{
|
{
|
||||||
for (const auto & [sensor_name, sensor_file] : sensors)
|
for (const auto & [sensor_name, sensor_file] : sensors)
|
||||||
{
|
{
|
||||||
@ -1106,19 +1141,32 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti
|
|||||||
new_values[fmt::format("Temperature_{}_{}", hwmon_name, sensor_name)] = temperature * 0.001;
|
new_values[fmt::format("Temperature_{}_{}", hwmon_name, sensor_name)] = temperature * 0.001;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
catch (...)
|
||||||
|
{
|
||||||
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
||||||
|
|
||||||
|
/// Files can be re-created on:
|
||||||
|
/// - module load/unload
|
||||||
|
/// - suspend/resume cycle
|
||||||
|
/// So file descriptors should be reopened.
|
||||||
|
try
|
||||||
|
{
|
||||||
|
openSensorsChips();
|
||||||
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
{
|
{
|
||||||
tryLogCurrentException(__PRETTY_FUNCTION__);
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
for (size_t i = 0, size = edac.size(); i < size; ++i)
|
for (size_t i = 0, size = edac.size(); i < size; ++i)
|
||||||
{
|
{
|
||||||
/// NOTE maybe we need to take difference with previous values.
|
/// NOTE maybe we need to take difference with previous values.
|
||||||
/// But these metrics should be exceptionally rare, so it's ok to keep them accumulated.
|
/// But these metrics should be exceptionally rare, so it's ok to keep them accumulated.
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
if (edac[i].first)
|
if (edac[i].first)
|
||||||
{
|
{
|
||||||
ReadBufferFromFilePRead & in = *edac[i].first;
|
ReadBufferFromFilePRead & in = *edac[i].first;
|
||||||
@ -1137,6 +1185,16 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti
|
|||||||
new_values[fmt::format("EDAC{}_Uncorrectable", i)] = errors;
|
new_values[fmt::format("EDAC{}_Uncorrectable", i)] = errors;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
catch (...)
|
||||||
|
{
|
||||||
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
||||||
|
|
||||||
|
/// EDAC files can be re-created on module load/unload
|
||||||
|
try
|
||||||
|
{
|
||||||
|
openEDAC();
|
||||||
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
{
|
{
|
||||||
tryLogCurrentException(__PRETTY_FUNCTION__);
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
||||||
|
@ -183,7 +183,10 @@ private:
|
|||||||
|
|
||||||
Stopwatch block_devices_rescan_delay;
|
Stopwatch block_devices_rescan_delay;
|
||||||
|
|
||||||
|
void openSensors();
|
||||||
void openBlockDevices();
|
void openBlockDevices();
|
||||||
|
void openSensorsChips();
|
||||||
|
void openEDAC();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
std::unique_ptr<ThreadFromGlobalPool> thread;
|
std::unique_ptr<ThreadFromGlobalPool> thread;
|
||||||
|
Loading…
Reference in New Issue
Block a user