#include #include #include #include #include #include #include #include #include namespace ProfileEvents { extern const Event ReadBufferFromFileDescriptorRead; extern const Event ReadBufferFromFileDescriptorReadFailed; extern const Event ReadBufferFromFileDescriptorReadBytes; extern const Event DiskReadElapsedMicroseconds; extern const Event Seek; } namespace CurrentMetrics { extern const Metric Read; } namespace DB { namespace ErrorCodes { extern const int CANNOT_READ_FROM_FILE_DESCRIPTOR; extern const int ARGUMENT_OUT_OF_BOUND; extern const int CANNOT_SEEK_THROUGH_FILE; extern const int CANNOT_SELECT; } std::string ReadBufferFromFileDescriptor::getFileName() const { return "(fd = " + toString(fd) + ")"; } bool ReadBufferFromFileDescriptor::nextImpl() { size_t bytes_read = 0; while (!bytes_read) { ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorRead); Stopwatch watch(profile_callback ? clock_type : CLOCK_MONOTONIC); ssize_t res = 0; { CurrentMetrics::Increment metric_increment{CurrentMetrics::Read}; res = ::read(fd, internal_buffer.begin(), internal_buffer.size()); } if (!res) break; if (-1 == res && errno != EINTR) { ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorReadFailed); throwFromErrnoWithPath("Cannot read from file " + getFileName(), getFileName(), ErrorCodes::CANNOT_READ_FROM_FILE_DESCRIPTOR); } if (res > 0) bytes_read += res; /// It reports real time spent including the time spent while thread was preempted doing nothing. /// And it is Ok for the purpose of this watch (it is used to lower the number of threads to read from tables). /// Sometimes it is better to use taskstats::blkio_delay_total, but it is quite expensive to get it (TaskStatsInfoGetter has about 500K RPS). watch.stop(); ProfileEvents::increment(ProfileEvents::DiskReadElapsedMicroseconds, watch.elapsedMicroseconds()); if (profile_callback) { ProfileInfo info; info.bytes_requested = internal_buffer.size(); info.bytes_read = res; info.nanoseconds = watch.elapsed(); profile_callback(info); } } pos_in_file += bytes_read; if (bytes_read) { ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorReadBytes, bytes_read); working_buffer.resize(bytes_read); } else return false; return true; } /// If 'offset' is small enough to stay in buffer after seek, then true seek in file does not happen. off_t ReadBufferFromFileDescriptor::doSeek(off_t offset, int whence) { off_t new_pos = offset; if (whence == SEEK_CUR) new_pos = pos_in_file - (working_buffer.end() - pos) + offset; else if (whence != SEEK_SET) throw Exception("ReadBufferFromFileDescriptor::seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::ARGUMENT_OUT_OF_BOUND); /// Position is unchanged. if (new_pos + (working_buffer.end() - pos) == pos_in_file) return new_pos; if (hasPendingData() && new_pos <= pos_in_file && new_pos >= pos_in_file - static_cast(working_buffer.size())) { /// Position is still inside buffer. pos = working_buffer.begin() + (new_pos - (pos_in_file - working_buffer.size())); return new_pos; } else { ProfileEvents::increment(ProfileEvents::Seek); Stopwatch watch(profile_callback ? clock_type : CLOCK_MONOTONIC); pos = working_buffer.end(); off_t res = ::lseek(fd, new_pos, SEEK_SET); if (-1 == res) throwFromErrnoWithPath("Cannot seek through file " + getFileName(), getFileName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE); pos_in_file = new_pos; watch.stop(); ProfileEvents::increment(ProfileEvents::DiskReadElapsedMicroseconds, watch.elapsedMicroseconds()); return res; } } /// Assuming file descriptor supports 'select', check that we have data to read or wait until timeout. bool ReadBufferFromFileDescriptor::poll(size_t timeout_microseconds) { fd_set fds; FD_ZERO(&fds); FD_SET(fd, &fds); timeval timeout = { time_t(timeout_microseconds / 1000000), suseconds_t(timeout_microseconds % 1000000) }; int res = select(1, &fds, nullptr, nullptr, &timeout); if (-1 == res) throwFromErrno("Cannot select", ErrorCodes::CANNOT_SELECT); return res > 0; } }