ClickHouse/src/IO/ReadBuffer.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

282 lines
9.5 KiB
C++
Raw Normal View History

2011-08-09 15:57:33 +00:00
#pragma once
2010-06-04 18:25:25 +00:00
2020-07-31 14:53:41 +00:00
#include <cassert>
2010-06-04 18:25:25 +00:00
#include <cstring>
2011-08-09 15:57:33 +00:00
#include <algorithm>
#include <memory>
2010-06-04 18:25:25 +00:00
#include <Common/Exception.h>
#include <Common/Priority.h>
#include <IO/BufferBase.h>
2022-06-06 04:58:22 +00:00
#include <IO/AsynchronousReader.h>
2010-06-04 18:25:25 +00:00
namespace DB
{
namespace ErrorCodes
{
/// Code of the exception raised by ReadBuffer::throwReadAfterEOF().
extern const int ATTEMPT_TO_READ_AFTER_EOF;
/// Code of the exception raised by ReadBuffer::readStrict(char *, size_t) on a short read.
extern const int CANNOT_READ_ALL_DATA;
}
/// Priority value intended as the default for prefetch requests (see ReadBuffer::prefetch()).
/// Declared `inline` so that every translation unit including this header shares one definition
/// instead of getting its own internal-linkage copy.
inline constexpr auto DEFAULT_PREFETCH_PRIORITY = Priority{0};
2023-02-07 17:50:31 +00:00
2017-05-28 14:29:40 +00:00
/** A simple abstract class for buffered data reading (char sequences) from somewhere.
* Unlike std::istream, it provides access to the internal buffer,
* and also allows you to manually manage the position inside the buffer.
2010-06-04 18:25:25 +00:00
*
2017-05-28 14:29:40 +00:00
* Note! `char *`, not `const char *` is used
* (so that you can take out the common code into BufferBase, and also so that you can fill the buffer in with new data).
* This causes inconveniences - for example, when using ReadBuffer to read from a chunk of memory const char *,
* you have to use const_cast.
2011-10-16 03:05:15 +00:00
*
2020-02-17 17:12:22 +00:00
* Derived classes must implement the nextImpl() method.
2010-06-04 18:25:25 +00:00
*/
class ReadBuffer : public BufferBase
2010-06-04 18:25:25 +00:00
{
public:
2017-05-28 14:29:40 +00:00
/** Constructs a reader over the memory [ptr, ptr + size) whose readable part is initially empty:
  * the working buffer is shrunk to zero size, so the very first read attempt has to call next()
  * to load a portion of data.
  */
ReadBuffer(Position ptr, size_t size)
    : BufferBase(ptr, size, 0)
{
    working_buffer.resize(0);
}
2017-05-28 14:29:40 +00:00
/** Constructs a reader over memory that already contains readable data.
  * All three arguments are forwarded to BufferBase unchanged; pass 0 as `offset`
  * when the data should be read from its beginning.
  */
ReadBuffer(Position ptr, size_t size, size_t offset)
    : BufferBase(ptr, size, offset)
{
}
2020-07-31 14:53:41 +00:00
// Read buffers may hold a lot of memory or open files, so copying them can be dangerous.
// The copy constructor is deleted to prevent accidental copies.
// NOTE(review): only the copy constructor is deleted here; copy assignment is not declared
// in this class - confirm whether that is intentional.
ReadBuffer(const ReadBuffer &) = delete;
// FIXME: the behavior differs greatly from `BufferBase::set()`, which is very confusing:
// after delegating to the base class with a zero offset, the working buffer is additionally
// shrunk to zero size, so no data is considered available until next() is called.
void set(Position ptr, size_t size)
{
    BufferBase::set(ptr, size, 0);
    working_buffer.resize(0);
}
/** read next data and fill a buffer with it; set position to the beginning of the new data
* (but not necessarily to the beginning of working_buffer!);
* return `false` in case of end, `true` otherwise; throw an exception, if something is wrong;
*
* if an exception was thrown, is the ReadBuffer left in a usable state? this varies across implementations;
* can the caller retry next() after an exception, or call other methods? not recommended
2010-06-04 18:25:25 +00:00
*/
bool next()
{
2024-03-03 13:22:40 +00:00
chassert(!hasPendingData());
chassert(position() <= working_buffer.end());
2021-02-03 12:37:32 +00:00
bytes += offset();
bool res = nextImpl();
2011-06-26 21:30:59 +00:00
if (!res)
2024-03-03 13:22:40 +00:00
{
2021-02-04 14:46:46 +00:00
working_buffer = Buffer(pos, pos);
2024-03-03 13:22:40 +00:00
}
2021-02-04 14:46:46 +00:00
else
2021-11-26 12:03:46 +00:00
{
2024-03-03 13:22:40 +00:00
pos = working_buffer.begin() + std::min(nextimpl_working_buffer_offset, working_buffer.size());
chassert(position() < working_buffer.end());
2021-11-26 12:03:46 +00:00
}
2020-07-31 14:53:41 +00:00
nextimpl_working_buffer_offset = 0;
2024-03-03 13:22:40 +00:00
chassert(position() <= working_buffer.end());
return res;
}
/// Calls next() only when the working buffer has no pending data; the result of next() is discarded.
inline void nextIfAtEnd()
{
    if (hasPendingData())
        return;

    next();
}
2021-02-03 12:37:32 +00:00
/// Virtual destructor: the class has virtual methods and is meant to be derived from.
virtual ~ReadBuffer() = default;
2017-05-28 14:29:40 +00:00
/** Returns true when all data has been read (unlike std::istream, where the flag is set
  * only after an attempt to read past the end).
  *
  * Has a side effect: if there is no pending data in the buffer, next() is called,
  * which may refill the buffer and move the position to the new data.
  *
  * An attempt to read after the end should throw an exception.
  */
bool ALWAYS_INLINE eof()
{
    if (hasPendingData())
        return false;

    return !next();
}
2010-06-04 18:25:25 +00:00
void ignore()
{
if (!eof())
++pos;
else
2019-02-21 15:49:42 +00:00
throwReadAfterEOF();
2010-06-04 18:25:25 +00:00
}
void ignore(size_t n)
{
while (n != 0 && !eof())
{
size_t bytes_to_ignore = std::min(static_cast<size_t>(working_buffer.end() - pos), n);
pos += bytes_to_ignore;
n -= bytes_to_ignore;
}
if (n)
2019-02-21 15:49:42 +00:00
throwReadAfterEOF();
}
2017-05-28 14:29:40 +00:00
/// Skips up to n bytes and returns how many were actually skipped (less than n if the data ended).
/// Never throws on end of data. This method could have been called `ignore`,
/// with the current `ignore` named `ignoreStrict`.
size_t tryIgnore(size_t n)
{
    size_t skipped = 0;
    while (skipped < n)
    {
        if (eof())
            break;

        const size_t available = static_cast<size_t>(working_buffer.end() - pos);
        const size_t step = std::min(available, n - skipped);
        pos += step;
        skipped += step;
    }
    return skipped;
}
2020-09-08 16:28:49 +00:00
void ignoreAll()
{
tryIgnore(std::numeric_limits<size_t>::max());
}
/// Looks at the next byte without consuming it.
/// Returns false (leaving `c` untouched) if there is no more data; otherwise stores the byte in `c`.
bool ALWAYS_INLINE peek(char & c)
{
    const bool has_byte = !eof();
    if (has_byte)
        c = *pos;
    return has_byte;
}
/// Reads a single byte.
2022-11-11 09:56:18 +00:00
[[nodiscard]] bool ALWAYS_INLINE read(char & c)
{
if (peek(c))
{
++pos;
return true;
}
return false;
}
2019-02-21 15:49:42 +00:00
/// Reads one byte into `c`; throws (via throwReadAfterEOF) if there is no more data.
void ALWAYS_INLINE readStrict(char & c)
{
    if (!read(c))
        throwReadAfterEOF();
}
2017-05-28 14:29:40 +00:00
/** Reads as many as there are, no more than n bytes. */
2022-11-11 09:56:18 +00:00
[[nodiscard]] size_t read(char * to, size_t n)
2010-06-04 18:25:25 +00:00
{
size_t bytes_copied = 0;
2012-03-25 07:52:31 +00:00
while (bytes_copied < n && !eof())
2010-06-04 18:25:25 +00:00
{
size_t bytes_to_copy = std::min(static_cast<size_t>(working_buffer.end() - pos), n - bytes_copied);
::memcpy(to + bytes_copied, pos, bytes_to_copy);
2010-06-04 18:25:25 +00:00
pos += bytes_to_copy;
2010-06-04 19:06:32 +00:00
bytes_copied += bytes_to_copy;
2010-06-04 18:25:25 +00:00
}
2010-06-04 18:25:25 +00:00
return bytes_copied;
}
2017-05-28 14:29:40 +00:00
/** Reads n bytes, if there are less - throws an exception. */
2010-06-04 18:25:25 +00:00
void readStrict(char * to, size_t n)
{
auto read_bytes = read(to, n);
if (n != read_bytes)
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA,
"Cannot read all data. Bytes read: {}. Bytes expected: {}.", read_bytes, std::to_string(n));
2010-06-04 18:25:25 +00:00
}
2020-02-17 17:12:22 +00:00
/** A method that can be more efficiently implemented in derived classes, in the case of reading large enough blocks.
2017-05-28 14:29:40 +00:00
* The implementation can read data directly into `to`, without superfluous copying, if in `to` there is enough space for work.
* For example, a CompressedReadBuffer can decompress the data directly into `to`, if the entire decompressed block fits there.
* By default - the same as read.
* Don't use for small reads.
*/
2022-11-11 09:56:18 +00:00
[[nodiscard]] virtual size_t readBig(char * to, size_t n) { return read(to, n); }
2021-07-26 00:34:36 +00:00
/** Do something to allow a faster subsequent call to 'nextImpl', if possible.
  * It's used for asynchronous readers with double-buffering.
  * `priority` is the `ThreadPool` priority with which the prefetch task will be scheduled.
  * A lower value means a higher priority.
  *
  * The implementation in this class does nothing.
  */
virtual void prefetch(Priority) {}
2021-03-04 11:10:21 +00:00
2021-10-21 17:43:27 +00:00
/**
  * Set the upper bound for the read range [..., position).
  * Useful for reading from a remote filesystem, when it matters how much we read.
  * Doesn't affect getFileSize().
  * See also: SeekableReadBuffer::supportsRightBoundedReads().
  *
  * Behavior in weird cases is currently implementation-defined:
  *  - setReadUntilPosition() below the current position,
  *  - setReadUntilPosition() above the end of the file,
  *  - seek() to a position above the until position (even if you setReadUntilPosition() to a
  *    higher value right after the seek!).
  *
  * Implementations are recommended to:
  *  - Allow the read-until-position to go below the current position, e.g.:
  *      // Read block [300, 400)
  *      setReadUntilPosition(400);
  *      seek(300);
  *      next();
  *      // Read block [100, 200)
  *      setReadUntilPosition(200); // this is below the current position, but should be allowed
  *      seek(100); // but now everything's fine again
  *      next();
  *      // (Swapping the order of seek and setReadUntilPosition doesn't help: then it breaks if the order of blocks is reversed.)
  *  - Check whether the new read-until-position value is equal to the current value and do nothing in that case,
  *    so that the caller doesn't have to.
  *
  * Typical implementations discard any current buffers and connections when the
  * read-until-position changes even by a small (nonzero) amount.
  *
  * The implementation in this class does nothing.
  */
virtual void setReadUntilPosition(size_t /* position */) {}
2021-10-21 17:43:27 +00:00
/// Counterpart of setReadUntilPosition(); judging by the name, it removes the upper bound
/// so that reading continues until the end - confirm against overriding classes.
/// The implementation in this class does nothing.
virtual void setReadUntilEnd() {}
protected:
2020-07-31 14:53:41 +00:00
/// The number of bytes to ignore from the initial position of the `working_buffer`
/// buffer. Effectively this is an additional out-parameter of nextImpl() rather than a real field:
/// after a successful nextImpl(), next() sets the position to
/// working_buffer.begin() + min(this value, working_buffer.size()),
/// and next() resets the value to 0 on every call, whatever nextImpl() returned.
size_t nextimpl_working_buffer_offset = 0;
2011-05-13 19:40:56 +00:00
private:
2017-05-28 14:29:40 +00:00
/** Reads the next portion of data and fills the buffer with it.
  * Returns `false` at the end of data and `true` otherwise; throws an exception if something is wrong.
  * Invoked by next(). The default implementation reports the end of data immediately.
  */
virtual bool nextImpl()
{
    return false;
}
2019-02-21 15:49:42 +00:00
2021-02-06 22:31:20 +00:00
[[noreturn]] static void throwReadAfterEOF()
2019-02-21 15:49:42 +00:00
{
throw Exception(ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF, "Attempt to read after eof");
2019-02-21 15:49:42 +00:00
}
2010-06-04 18:25:25 +00:00
};
/// Shared-ownership pointer to a ReadBuffer.
using ReadBufferPtr = std::shared_ptr<ReadBuffer>;
/// Due to inconsistencies in the ReadBuffer-family interfaces:
///  - some require fully wrapping the underlying buffer and owning it,
///  - some just wrap a reference without ownership,
/// we need to be able to wrap reference-only buffers with a movable transparent proxy buffer.
/// Ensuring the uniqueness of such wrappers is the responsibility of the code author.
/// Only the declarations are given here; the definitions are not part of this header.
std::unique_ptr<ReadBuffer> wrapReadBufferReference(ReadBuffer & ref);
std::unique_ptr<ReadBuffer> wrapReadBufferPointer(ReadBufferPtr ptr);
2010-06-04 18:25:25 +00:00
}