#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /// Basic idea is motivated by "iotop" tool. /// More info: https://www.kernel.org/doc/Documentation/accounting/taskstats.txt #define GENLMSG_DATA(glh) ((void *)((char*)NLMSG_DATA(glh) + GENL_HDRLEN)) #define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) #define NLA_DATA(na) ((void *)((char*)(na) + NLA_HDRLEN)) #define NLA_PAYLOAD(len) (len - NLA_HDRLEN) namespace DB { namespace ErrorCodes { extern const int NETLINK_ERROR; } namespace { static size_t constexpr MAX_MSG_SIZE = 1024; struct NetlinkMessage { ::nlmsghdr n; ::genlmsghdr g; char buf[MAX_MSG_SIZE]; }; void sendCommand( int sock_fd, UInt16 nlmsg_type, UInt32 nlmsg_pid, UInt8 genl_cmd, UInt16 nla_type, void * nla_data, int nla_len) { NetlinkMessage msg{}; msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); msg.n.nlmsg_type = nlmsg_type; msg.n.nlmsg_flags = NLM_F_REQUEST; msg.n.nlmsg_seq = 0; msg.n.nlmsg_pid = nlmsg_pid; msg.g.cmd = genl_cmd; msg.g.version = 1; ::nlattr * attr = static_cast<::nlattr *>(GENLMSG_DATA(&msg)); attr->nla_type = nla_type; attr->nla_len = nla_len + 1 + NLA_HDRLEN; memcpy(NLA_DATA(attr), nla_data, nla_len); msg.n.nlmsg_len += NLMSG_ALIGN(attr->nla_len); char * buf = reinterpret_cast(&msg); ssize_t buflen = msg.n.nlmsg_len; ::sockaddr_nl nladdr{}; nladdr.nl_family = AF_NETLINK; while (true) { ssize_t r = ::sendto(sock_fd, buf, buflen, 0, reinterpret_cast(&nladdr), sizeof(nladdr)); if (r >= buflen) break; if (r > 0) { buf += r; buflen -= r; } else if (errno != EAGAIN) throwFromErrno("Can't send a Netlink command", ErrorCodes::NETLINK_ERROR); } } UInt16 getFamilyId(int nl_sock_fd) { struct { ::nlmsghdr header; ::genlmsghdr ge_header; char buf[256]; } answer; static char name[] = TASKSTATS_GENL_NAME; sendCommand( nl_sock_fd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, CTRL_ATTR_FAMILY_NAME, (void *) name, strlen(TASKSTATS_GENL_NAME) + 1); UInt16 id = 0; ssize_t rep_len = ::recv(nl_sock_fd, &answer, sizeof(answer), 0); if (rep_len < 0) throwFromErrno("Cannot get the family id for " + std::string(TASKSTATS_GENL_NAME) + " from the Netlink socket", ErrorCodes::NETLINK_ERROR); if (answer.header.nlmsg_type == NLMSG_ERROR ||!NLMSG_OK((&answer.header), rep_len)) throw Exception("Received an error instead of the family id for " + std::string(TASKSTATS_GENL_NAME) + " from the Netlink socket", ErrorCodes::NETLINK_ERROR); const ::nlattr * attr; attr = static_cast(GENLMSG_DATA(&answer)); attr = reinterpret_cast(reinterpret_cast(attr) + NLA_ALIGN(attr->nla_len)); if (attr->nla_type == CTRL_ATTR_FAMILY_ID) id = *static_cast(NLA_DATA(attr)); return id; } } void TaskStatsInfoGetter::init() { if (netlink_socket_fd >= 0) return; netlink_socket_fd = ::socket(PF_NETLINK, SOCK_RAW, NETLINK_GENERIC); if (netlink_socket_fd < 0) throwFromErrno("Can't create PF_NETLINK socket", ErrorCodes::NETLINK_ERROR); struct timeval tv; tv.tv_sec = 0; tv.tv_usec = 50000; if (0 != ::setsockopt(netlink_socket_fd, SOL_SOCKET, SO_RCVTIMEO, reinterpret_cast(&tv), sizeof(tv))) throwFromErrno("Can't set timeout on PF_NETLINK socket", ErrorCodes::NETLINK_ERROR); ::sockaddr_nl addr{}; addr.nl_family = AF_NETLINK; if (::bind(netlink_socket_fd, reinterpret_cast(&addr), sizeof(addr)) < 0) throwFromErrno("Can't bind PF_NETLINK socket", ErrorCodes::NETLINK_ERROR); netlink_family_id = getFamilyId(netlink_socket_fd); } void TaskStatsInfoGetter::getStat(::taskstats & out_stats, pid_t tid) { init(); sendCommand(netlink_socket_fd, netlink_family_id, tid, TASKSTATS_CMD_GET, TASKSTATS_CMD_ATTR_PID, &tid, sizeof(pid_t)); NetlinkMessage msg; ssize_t rv = ::recv(netlink_socket_fd, &msg, sizeof(msg), 0); if (msg.n.nlmsg_type == NLMSG_ERROR || !NLMSG_OK((&msg.n), rv)) { const ::nlmsgerr * err = static_cast(NLMSG_DATA(&msg)); throw Exception("Can't get Netlink response, error: " + std::to_string(err->error), ErrorCodes::NETLINK_ERROR); } rv = GENLMSG_PAYLOAD(&msg.n); const ::nlattr * attr = static_cast(GENLMSG_DATA(&msg)); ssize_t len = 0; while (len < rv) { len += NLA_ALIGN(attr->nla_len); if (attr->nla_type == TASKSTATS_TYPE_AGGR_TGID || attr->nla_type == TASKSTATS_TYPE_AGGR_PID) { int aggr_len = NLA_PAYLOAD(attr->nla_len); int len2 = 0; attr = static_cast(NLA_DATA(attr)); while (len2 < aggr_len) { if (attr->nla_type == TASKSTATS_TYPE_STATS) { const ::taskstats * ts = static_cast(NLA_DATA(attr)); out_stats = *ts; } len2 += NLA_ALIGN(attr->nla_len); attr = reinterpret_cast(reinterpret_cast(attr) + len2); } } attr = reinterpret_cast(reinterpret_cast(GENLMSG_DATA(&msg)) + len); } } pid_t TaskStatsInfoGetter::getCurrentTID() { /// This call is always successful. - man gettid return static_cast(syscall(SYS_gettid)); } static bool checkPermissionsImpl() { /// See man getcap. __user_cap_header_struct request{}; request.version = _LINUX_CAPABILITY_VERSION_1; /// It's enough to check just single CAP_NET_ADMIN capability we are interested. request.pid = getpid(); __user_cap_data_struct response{}; /// Avoid dependency on 'libcap'. if (0 != syscall(SYS_capget, &request, &response)) throwFromErrno("Cannot do 'capget' syscall", ErrorCodes::NETLINK_ERROR); return (1 << CAP_NET_ADMIN) & response.effective; } bool TaskStatsInfoGetter::checkPermissions() { /// It is thread- and exception- safe since C++11 static bool res = checkPermissionsImpl(); return res; } TaskStatsInfoGetter::~TaskStatsInfoGetter() { if (netlink_socket_fd >= 0) close(netlink_socket_fd); } }