mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-15 10:52:30 +00:00
03e90cd7cf
The following metrics can be useful to calculate various rates (i.e. disk/network IO rates): - AsynchronousHeavyMetricsUpdateInterval - AsynchronousMetricsUpdateInterval The following had been added by analogy with the AsynchronousMetricsCalculationTimeSpent: - AsynchronousHeavyMetricsCalculationTimeSpent Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
405 lines
21 KiB
C++
405 lines
21 KiB
C++
#include <Interpreters/ServerAsynchronousMetrics.h>
|
|
|
|
#include <Interpreters/Aggregator.h>
|
|
#include <Interpreters/AsynchronousMetricLog.h>
|
|
#include <Interpreters/DatabaseCatalog.h>
|
|
#include <Interpreters/Cache/FileCache.h>
|
|
#include <Interpreters/Cache/FileCacheFactory.h>
|
|
#include <Interpreters/Context.h>
|
|
#include <Interpreters/JIT/CompiledExpressionCache.h>
|
|
|
|
#include <Databases/IDatabase.h>
|
|
|
|
#include <IO/UncompressedCache.h>
|
|
#include <IO/MMappedFileCache.h>
|
|
|
|
#include <Storages/MergeTree/MergeTreeData.h>
|
|
#include <Storages/MergeTree/MergeTreeMetadataCache.h>
|
|
#include <Storages/StorageMergeTree.h>
|
|
#include <Storages/StorageReplicatedMergeTree.h>
|
|
#include <Storages/MarkCache.h>
|
|
|
|
#include <Coordination/KeeperAsynchronousMetrics.h>
|
|
|
|
namespace DB
|
|
{
|
|
|
|
namespace
|
|
{
|
|
|
|
/// Raises `max` to `x` if `x`, converted to the type `Max`, is strictly greater than the current `max`.
/// The comparison is done in the `Max` type; note that a negative signed `x` converted to an
/// unsigned `Max` wraps around to a large value.
template <typename Max, typename T>
void calculateMax(Max & max, T x)
{
    const bool is_new_maximum = max < Max(x);
    if (is_new_maximum)
        max = x;
}
|
|
|
|
/// Accumulates `x` into `sum` and, when `x` converted to the type `Max` is strictly greater
/// than the current `max`, stores `x` into `max` as well.
template <typename Max, typename Sum, typename T>
void calculateMaxAndSum(Max & max, Sum & sum, T x)
{
    if (max < Max(x))
        max = x;

    sum += x;
}
|
|
|
|
}
|
|
|
|
/// Constructs the server-side asynchronous metrics collector.
///
/// `update_period_seconds` and `protocol_server_metrics_func_` are forwarded to the
/// `AsynchronousMetrics` base, `global_context_` is stored through `WithContext`, and
/// `heavy_metrics_update_period_seconds` initializes the period used for the heavy metrics.
ServerAsynchronousMetrics::ServerAsynchronousMetrics(
    ContextPtr global_context_,
    int update_period_seconds,
    int heavy_metrics_update_period_seconds,
    const ProtocolServerMetricsFunc & protocol_server_metrics_func_)
    : AsynchronousMetrics(update_period_seconds, protocol_server_metrics_func_),
      WithContext(global_context_),
      heavy_metric_update_period(heavy_metrics_update_period_seconds)
{
}
|
|
|
|
/// Fills `new_values` with the server-specific asynchronous metrics:
/// sizes of the in-memory and filesystem caches, uptime, hash table statistics cache counters,
/// filesystem and per-disk space, replication queue/delay aggregates, and counts of
/// databases, tables, rows, bytes and parts. When built with NuRaft and a Keeper dispatcher
/// is available, Keeper information is added too. Finally delegates to
/// `updateHeavyMetricsIfNeeded` with `current_time` and `update_time`.
void ServerAsynchronousMetrics::updateImpl(AsynchronousMetricValues & new_values, TimePoint update_time, TimePoint current_time)
{
    /// In-memory caches. Each metric is reported only if the corresponding cache exists.
    if (auto mark_cache = getContext()->getMarkCache())
    {
        new_values["MarkCacheBytes"] = { mark_cache->weight(), "Total size of mark cache in bytes" };
        new_values["MarkCacheFiles"] = { mark_cache->count(), "Total number of mark files cached in the mark cache" };
    }

    if (auto uncompressed_cache = getContext()->getUncompressedCache())
    {
        new_values["UncompressedCacheBytes"] = { uncompressed_cache->weight(),
            "Total size of uncompressed cache in bytes. Uncompressed cache does not usually improve the performance and should be mostly avoided." };
        new_values["UncompressedCacheCells"] = { uncompressed_cache->count(),
            "Total number of entries in the uncompressed cache. Each entry represents a decompressed block of data. Uncompressed cache does not usually improve performance and should be mostly avoided." };
    }

    if (auto index_mark_cache = getContext()->getIndexMarkCache())
    {
        new_values["IndexMarkCacheBytes"] = { index_mark_cache->weight(), "Total size of mark cache for secondary indices in bytes." };
        new_values["IndexMarkCacheFiles"] = { index_mark_cache->count(), "Total number of mark files cached in the mark cache for secondary indices." };
    }

    if (auto index_uncompressed_cache = getContext()->getIndexUncompressedCache())
    {
        new_values["IndexUncompressedCacheBytes"] = { index_uncompressed_cache->weight(),
            "Total size of uncompressed cache in bytes for secondary indices. Uncompressed cache does not usually improve the performance and should be mostly avoided." };
        new_values["IndexUncompressedCacheCells"] = { index_uncompressed_cache->count(),
            "Total number of entries in the uncompressed cache for secondary indices. Each entry represents a decompressed block of data. Uncompressed cache does not usually improve performance and should be mostly avoided." };
    }

    if (auto mmap_cache = getContext()->getMMappedFileCache())
    {
        new_values["MMapCacheCells"] = { mmap_cache->count(),
            "The number of files opened with `mmap` (mapped in memory)."
            " This is used for queries with the setting `local_filesystem_read_method` set to `mmap`."
            " The files opened with `mmap` are kept in the cache to avoid costly TLB flushes."};
    }

    /// Filesystem caches: totals summed over every cache known to the factory.
    {
        auto caches = FileCacheFactory::instance().getAll();
        size_t total_bytes = 0;
        size_t total_files = 0;

        for (const auto & [_, cache_data] : caches)
        {
            total_bytes += cache_data->cache->getUsedCacheSize();
            total_files += cache_data->cache->getFileSegmentsNum();
        }

        new_values["FilesystemCacheBytes"] = { total_bytes,
            "Total bytes in the `cache` virtual filesystem. This cache is hold on disk." };
        new_values["FilesystemCacheFiles"] = { total_files,
            "Total number of cached file segments in the `cache` virtual filesystem. This cache is hold on disk." };
    }

#if USE_ROCKSDB
    if (auto metadata_cache = getContext()->tryGetMergeTreeMetadataCache())
    {
        new_values["MergeTreeMetadataCacheSize"] = { metadata_cache->getEstimateNumKeys(),
            "The size of the metadata cache for tables. This cache is experimental and not used in production." };
    }
#endif

#if USE_EMBEDDED_COMPILER
    if (auto * compiled_expression_cache = CompiledExpressionCacheFactory::instance().tryGetCache())
    {
        new_values["CompiledExpressionCacheBytes"] = { compiled_expression_cache->weight(),
            "Total bytes used for the cache of JIT-compiled code." };
        new_values["CompiledExpressionCacheCount"] = { compiled_expression_cache->count(),
            "Total entries in the cache of JIT-compiled code." };
    }
#endif

    new_values["Uptime"] = { getContext()->getUptimeSeconds(),
        "The server uptime in seconds. It includes the time spent for server initialization before accepting connections." };

    if (const auto stats = getHashTablesCacheStatistics())
    {
        new_values["HashTableStatsCacheEntries"] = { stats->entries,
            "The number of entries in the cache of hash table sizes."
            " The cache for hash table sizes is used for predictive optimization of GROUP BY." };
        new_values["HashTableStatsCacheHits"] = { stats->hits,
            "The number of times the prediction of a hash table size was correct." };
        new_values["HashTableStatsCacheMisses"] = { stats->misses,
            "The number of times the prediction of a hash table size was incorrect." };
    }

    /// Free space in filesystems at data path and logs path.
    {
        auto stat = getStatVFS(getContext()->getPath());

        new_values["FilesystemMainPathTotalBytes"] = { stat.f_blocks * stat.f_frsize,
            "The size of the volume where the main ClickHouse path is mounted, in bytes." };
        new_values["FilesystemMainPathAvailableBytes"] = { stat.f_bavail * stat.f_frsize,
            "Available bytes on the volume where the main ClickHouse path is mounted." };
        new_values["FilesystemMainPathUsedBytes"] = { (stat.f_blocks - stat.f_bavail) * stat.f_frsize,
            "Used bytes on the volume where the main ClickHouse path is mounted." };
        new_values["FilesystemMainPathTotalINodes"] = { stat.f_files,
            "The total number of inodes on the volume where the main ClickHouse path is mounted. If it is less than 25 million, it indicates a misconfiguration." };
        new_values["FilesystemMainPathAvailableINodes"] = { stat.f_favail,
            "The number of available inodes on the volume where the main ClickHouse path is mounted. If it is close to zero, it indicates a misconfiguration, and you will get 'no space left on device' even when the disk is not full." };
        new_values["FilesystemMainPathUsedINodes"] = { stat.f_files - stat.f_favail,
            "The number of used inodes on the volume where the main ClickHouse path is mounted. This value mostly corresponds to the number of files." };
    }

    {
        /// Current working directory of the server is the directory with logs.
        auto stat = getStatVFS(".");

        new_values["FilesystemLogsPathTotalBytes"] = { stat.f_blocks * stat.f_frsize,
            "The size of the volume where ClickHouse logs path is mounted, in bytes. It's recommended to have at least 10 GB for logs." };
        new_values["FilesystemLogsPathAvailableBytes"] = { stat.f_bavail * stat.f_frsize,
            "Available bytes on the volume where ClickHouse logs path is mounted. If this value approaches zero, you should tune the log rotation in the configuration file." };
        new_values["FilesystemLogsPathUsedBytes"] = { (stat.f_blocks - stat.f_bavail) * stat.f_frsize,
            "Used bytes on the volume where ClickHouse logs path is mounted." };
        new_values["FilesystemLogsPathTotalINodes"] = { stat.f_files,
            "The total number of inodes on the volume where ClickHouse logs path is mounted." };
        new_values["FilesystemLogsPathAvailableINodes"] = { stat.f_favail,
            "The number of available inodes on the volume where ClickHouse logs path is mounted." };
        new_values["FilesystemLogsPathUsedINodes"] = { stat.f_files - stat.f_favail,
            "The number of used inodes on the volume where ClickHouse logs path is mounted." };
    }

    /// Free and total space on every configured disk.
    {
        DisksMap disks_map = getContext()->getDisksMap();
        for (const auto & [name, disk] : disks_map)
        {
            auto total = disk->getTotalSpace();

            /// Some disks don't support information about the space.
            if (!total)
                continue;

            auto available = disk->getAvailableSpace();
            auto unreserved = disk->getUnreservedSpace();

            new_values[fmt::format("DiskTotal_{}", name)] = { total,
                "The total size in bytes of the disk (virtual filesystem). Remote filesystems can show a large value like 16 EiB." };
            new_values[fmt::format("DiskUsed_{}", name)] = { total - available,
                "Used bytes on the disk (virtual filesystem). Remote filesystems not always provide this information." };
            new_values[fmt::format("DiskAvailable_{}", name)] = { available,
                "Available bytes on the disk (virtual filesystem). Remote filesystems can show a large value like 16 EiB." };
            new_values[fmt::format("DiskUnreserved_{}", name)] = { unreserved,
                "Available bytes on the disk (virtual filesystem) without the reservations for merges, fetches, and moves. Remote filesystems can show a large value like 16 EiB." };
        }
    }

    /// Aggregates over all databases and tables.
    {
        auto databases = DatabaseCatalog::instance().getDatabases();

        size_t max_queue_size = 0;
        size_t max_inserts_in_queue = 0;
        size_t max_merges_in_queue = 0;

        size_t sum_queue_size = 0;
        size_t sum_inserts_in_queue = 0;
        size_t sum_merges_in_queue = 0;

        size_t max_absolute_delay = 0;
        size_t max_relative_delay = 0;

        size_t max_part_count_for_partition = 0;

        size_t number_of_databases = 0;
        /// Bind by const reference: only the name is inspected, so there is no need to copy
        /// every (name, database pointer) entry of the map.
        for (const auto & [db_name, _] : databases)
            if (db_name != DatabaseCatalog::TEMPORARY_DATABASE)
                ++number_of_databases; /// filter out the internal database for temporary tables, system table "system.databases" behaves the same way

        size_t total_number_of_tables = 0;

        size_t total_number_of_bytes = 0;
        size_t total_number_of_rows = 0;
        size_t total_number_of_parts = 0;

        for (const auto & db : databases)
        {
            /// Check if database can contain MergeTree tables
            if (!db.second->canContainMergeTreeTables())
                continue;

            for (auto iterator = db.second->getTablesIterator(getContext()); iterator->isValid(); iterator->next())
            {
                /// The table is counted before the null check below, so entries without a table object are counted too.
                ++total_number_of_tables;
                const auto & table = iterator->table();
                if (!table)
                    continue;

                if (MergeTreeData * table_merge_tree = dynamic_cast<MergeTreeData *>(table.get()))
                {
                    const auto & settings = getContext()->getSettingsRef();

                    calculateMax(max_part_count_for_partition, table_merge_tree->getMaxPartsCountAndSizeForPartition().first);
                    total_number_of_bytes += table_merge_tree->totalBytes(settings).value();
                    total_number_of_rows += table_merge_tree->totalRows(settings).value();
                    total_number_of_parts += table_merge_tree->getActivePartsCount();
                }

                if (StorageReplicatedMergeTree * table_replicated_merge_tree = typeid_cast<StorageReplicatedMergeTree *>(table.get()))
                {
                    ReplicatedTableStatus status;
                    table_replicated_merge_tree->getStatus(status, false);

                    calculateMaxAndSum(max_queue_size, sum_queue_size, status.queue.queue_size);
                    calculateMaxAndSum(max_inserts_in_queue, sum_inserts_in_queue, status.queue.inserts_in_queue);
                    calculateMaxAndSum(max_merges_in_queue, sum_merges_in_queue, status.queue.merges_in_queue);

                    /// Replica delays are requested only for tables that are not read-only;
                    /// a failure here is logged and does not abort the metrics update.
                    if (!status.is_readonly)
                    {
                        try
                        {
                            time_t absolute_delay = 0;
                            time_t relative_delay = 0;
                            table_replicated_merge_tree->getReplicaDelays(absolute_delay, relative_delay);

                            calculateMax(max_absolute_delay, absolute_delay);
                            calculateMax(max_relative_delay, relative_delay);
                        }
                        catch (...)
                        {
                            tryLogCurrentException(__PRETTY_FUNCTION__,
                                "Cannot get replica delay for table: " + backQuoteIfNeed(db.first) + "." + backQuoteIfNeed(iterator->name()));
                        }
                    }
                }
            }
        }

        new_values["ReplicasMaxQueueSize"] = { max_queue_size, "Maximum queue size (in the number of operations like get, merge) across Replicated tables." };
        new_values["ReplicasMaxInsertsInQueue"] = { max_inserts_in_queue, "Maximum number of INSERT operations in the queue (still to be replicated) across Replicated tables." };
        new_values["ReplicasMaxMergesInQueue"] = { max_merges_in_queue, "Maximum number of merge operations in the queue (still to be applied) across Replicated tables." };

        new_values["ReplicasSumQueueSize"] = { sum_queue_size, "Sum queue size (in the number of operations like get, merge) across Replicated tables." };
        new_values["ReplicasSumInsertsInQueue"] = { sum_inserts_in_queue, "Sum of INSERT operations in the queue (still to be replicated) across Replicated tables." };
        new_values["ReplicasSumMergesInQueue"] = { sum_merges_in_queue, "Sum of merge operations in the queue (still to be applied) across Replicated tables." };

        new_values["ReplicasMaxAbsoluteDelay"] = { max_absolute_delay, "Maximum difference in seconds between the most fresh replicated part and the most fresh data part still to be replicated, across Replicated tables. A very high value indicates a replica with no data." };
        new_values["ReplicasMaxRelativeDelay"] = { max_relative_delay, "Maximum difference between the replica delay and the delay of the most up-to-date replica of the same table, across Replicated tables." };

        new_values["MaxPartCountForPartition"] = { max_part_count_for_partition, "Maximum number of parts per partition across all partitions of all tables of MergeTree family. Values larger than 300 indicates misconfiguration, overload, or massive data loading." };

        new_values["NumberOfDatabases"] = { number_of_databases, "Total number of databases on the server." };
        new_values["NumberOfTables"] = { total_number_of_tables, "Total number of tables summed across the databases on the server, excluding the databases that cannot contain MergeTree tables."
            " The excluded database engines are those who generate the set of tables on the fly, like `Lazy`, `MySQL`, `PostgreSQL`, `SQlite`."};

        new_values["TotalBytesOfMergeTreeTables"] = { total_number_of_bytes, "Total amount of bytes (compressed, including data and indices) stored in all tables of MergeTree family." };
        new_values["TotalRowsOfMergeTreeTables"] = { total_number_of_rows, "Total amount of rows (records) stored in all tables of MergeTree family." };
        new_values["TotalPartsOfMergeTreeTables"] = { total_number_of_parts, "Total amount of data parts in all tables of MergeTree family."
            " Numbers larger than 10 000 will negatively affect the server startup time and it may indicate unreasonable choice of the partition key." };
    }

#if USE_NURAFT
    {
        auto keeper_dispatcher = getContext()->tryGetKeeperDispatcher();
        if (keeper_dispatcher)
            updateKeeperInformation(*keeper_dispatcher, new_values);
    }
#endif

    updateHeavyMetricsIfNeeded(current_time, update_time, new_values);
}
|
|
|
|
/// Passes the freshly calculated metric values to the asynchronous metric log,
/// if the context returns one; otherwise does nothing.
void ServerAsynchronousMetrics::logImpl(AsynchronousMetricValues & new_values)
{
    auto metric_log = getContext()->getAsynchronousMetricLog();
    if (!metric_log)
        return;

    /// Log the new metrics.
    metric_log->addValues(new_values);
}
|
|
|
|
void ServerAsynchronousMetrics::updateDetachedPartsStats()
|
|
{
|
|
DetachedPartsStats current_values{};
|
|
|
|
for (const auto & db : DatabaseCatalog::instance().getDatabases())
|
|
{
|
|
if (!db.second->canContainMergeTreeTables())
|
|
continue;
|
|
|
|
for (auto iterator = db.second->getTablesIterator(getContext()); iterator->isValid(); iterator->next())
|
|
{
|
|
const auto & table = iterator->table();
|
|
if (!table)
|
|
continue;
|
|
|
|
if (MergeTreeData * table_merge_tree = dynamic_cast<MergeTreeData *>(table.get()))
|
|
{
|
|
for (const auto & detached_part: table_merge_tree->getDetachedParts())
|
|
{
|
|
if (!detached_part.valid_name)
|
|
continue;
|
|
|
|
if (detached_part.prefix.empty())
|
|
++current_values.detached_by_user;
|
|
|
|
++current_values.count;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
detached_parts_stats = current_values;
|
|
}
|
|
|
|
/// Recalculates the heavy (tables related) metrics when at least `heavy_metric_update_period`
/// has passed since the previous heavy update (measured against `current_time`) or on the
/// first run, and then publishes the heavy metrics into `new_values`.
///
/// When the heavy metrics are not recalculated, the previously stored detached parts
/// statistics and update interval are published again.
void ServerAsynchronousMetrics::updateHeavyMetricsIfNeeded(TimePoint current_time, TimePoint update_time, AsynchronousMetricValues & new_values)
{
    const auto time_after_previous_update = current_time - heavy_metric_previous_update_time;
    const bool update_heavy_metric = time_after_previous_update >= heavy_metric_update_period || first_run;

    Stopwatch watch;
    if (update_heavy_metric)
    {
        heavy_metric_previous_update_time = update_time;
        /// On the first run there is no previous update to measure from, so report the configured period.
        if (first_run)
            heavy_update_interval = heavy_metric_update_period.count();
        else
            heavy_update_interval = std::chrono::duration_cast<std::chrono::microseconds>(time_after_previous_update).count() / 1e6;

        /// Test shows that listing 100000 entries consuming around 0.15 sec.
        updateDetachedPartsStats();

        watch.stop();
        const auto elapsed_seconds = watch.elapsedSeconds();

        /// Normally heavy metrics don't delay the rest of the metrics calculation
        /// otherwise log the warning message.
        /// The larger threshold must be tested first: anything above 3/4 of the update period
        /// is also above 1/2 of it, so testing 1/2 first would make the warning level unreachable.
        auto log_level = std::make_pair(DB::LogsLevel::trace, Poco::Message::PRIO_TRACE);
        if (elapsed_seconds > (update_period.count() / 4. * 3))
            log_level = std::make_pair(DB::LogsLevel::warning, Poco::Message::PRIO_WARNING);
        else if (elapsed_seconds > (update_period.count() / 2.))
            log_level = std::make_pair(DB::LogsLevel::debug, Poco::Message::PRIO_DEBUG);
        LOG_IMPL(log, log_level.first, log_level.second,
            "Update heavy metrics. "
            "Update period {} sec. "
            "Update heavy metrics period {} sec. "
            "Heavy metrics calculation elapsed: {} sec.",
            update_period.count(),
            heavy_metric_update_period.count(),
            elapsed_seconds);
    }

    new_values["AsynchronousHeavyMetricsCalculationTimeSpent"] = { watch.elapsedSeconds(), "Time in seconds spent for calculation of asynchronous heavy (tables related) metrics (this is the overhead of asynchronous metrics)." };

    new_values["AsynchronousHeavyMetricsUpdateInterval"] = { heavy_update_interval, "Heavy (tables related) metrics update interval" };

    new_values["NumberOfDetachedParts"] = { detached_parts_stats.count, "The total number of parts detached from MergeTree tables. A part can be detached by a user with the `ALTER TABLE DETACH` query or by the server itself it the part is broken, unexpected or unneeded. The server does not care about detached parts and they can be removed." };
    new_values["NumberOfDetachedByUserParts"] = { detached_parts_stats.detached_by_user, "The total number of parts detached from MergeTree tables by users with the `ALTER TABLE DETACH` query (as opposed to unexpected, broken or ignored parts). The server does not care about detached parts and they can be removed." };
}
|
|
|
|
}
|