2012-01-30 19:18:25 +00:00
|
|
|
#pragma once
|
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Core/Defines.h>
|
2012-01-30 19:18:25 +00:00
|
|
|
#include <algorithm>
|
2011-06-27 18:22:14 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
|
2017-05-28 14:29:40 +00:00
|
|
|
/** Base class for ReadBuffer and WriteBuffer.
|
2017-08-07 07:31:16 +00:00
|
|
|
* Contains common types, variables, and functions.
|
2011-06-30 15:21:35 +00:00
|
|
|
*
|
2017-05-28 14:29:40 +00:00
|
|
|
* ReadBuffer and WriteBuffer are similar to istream and ostream, respectively.
|
|
|
|
* They have to be used, because using iostreams it is impossible to effectively implement some operations.
|
|
|
|
* For example, using istream, you can not quickly read string values from a tab-separated file,
|
|
|
|
* so that after reading, the position remains immediately after the read value.
|
|
|
|
* (The only option is to call the std::istream::get() function on each byte, but this slows down due to several virtual calls.)
|
2011-06-30 15:21:35 +00:00
|
|
|
*
|
2017-05-28 14:29:40 +00:00
|
|
|
* Read/WriteBuffers provide direct access to the internal buffer, so the necessary operations are implemented more efficiently.
|
|
|
|
* Only one virtual function nextImpl() is used, which is rarely called:
|
|
|
|
* - in the case of ReadBuffer - fill in the buffer with new data from the source;
|
|
|
|
* - in the case of WriteBuffer - write data from the buffer into the receiver.
|
2011-06-30 15:21:35 +00:00
|
|
|
*
|
2017-05-28 14:29:40 +00:00
|
|
|
* Read/WriteBuffer can own or not own an own piece of memory.
|
|
|
|
* In the second case, you can effectively read from an already existing piece of memory / std::string without copying it.
|
2011-06-27 18:22:14 +00:00
|
|
|
*/
|
|
|
|
class BufferBase
|
|
|
|
{
|
|
|
|
public:
|
2017-05-28 14:29:40 +00:00
|
|
|
/** Cursor in the buffer. The position of write or read. */
|
2017-04-01 07:20:54 +00:00
|
|
|
using Position = char *;
|
|
|
|
|
2017-05-28 14:29:40 +00:00
|
|
|
/** A reference to the range of memory. */
|
2017-04-01 07:20:54 +00:00
|
|
|
struct Buffer
|
|
|
|
{
|
|
|
|
Buffer(Position begin_pos_, Position end_pos_) : begin_pos(begin_pos_), end_pos(end_pos_) {}
|
|
|
|
|
|
|
|
inline Position begin() const { return begin_pos; }
|
|
|
|
inline Position end() const { return end_pos; }
|
2018-06-03 16:51:31 +00:00
|
|
|
inline size_t size() const { return size_t(end_pos - begin_pos); }
|
2017-04-01 07:20:54 +00:00
|
|
|
inline void resize(size_t size) { end_pos = begin_pos + size; }
|
2021-02-03 12:31:45 +00:00
|
|
|
inline bool empty() const { return size() == 0; }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
inline void swap(Buffer & other)
|
|
|
|
{
|
|
|
|
std::swap(begin_pos, other.begin_pos);
|
|
|
|
std::swap(end_pos, other.end_pos);
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
Position begin_pos;
|
2017-05-28 14:29:40 +00:00
|
|
|
Position end_pos; /// 1 byte after the end of the buffer
|
2017-04-01 07:20:54 +00:00
|
|
|
};
|
|
|
|
|
2017-05-28 14:29:40 +00:00
|
|
|
/** The constructor takes a range of memory to use for the buffer.
|
|
|
|
* offset - the starting point of the cursor. ReadBuffer must set it to the end of the range, and WriteBuffer - to the beginning.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
BufferBase(Position ptr, size_t size, size_t offset)
|
2018-01-13 19:29:10 +00:00
|
|
|
: pos(ptr + offset), working_buffer(ptr, ptr + size), internal_buffer(ptr, ptr + size) {}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
void set(Position ptr, size_t size, size_t offset)
|
|
|
|
{
|
|
|
|
internal_buffer = Buffer(ptr, ptr + size);
|
|
|
|
working_buffer = Buffer(ptr, ptr + size);
|
|
|
|
pos = ptr + offset;
|
|
|
|
}
|
|
|
|
|
2017-05-28 14:29:40 +00:00
|
|
|
/// get buffer
|
2017-04-01 07:20:54 +00:00
|
|
|
inline Buffer & internalBuffer() { return internal_buffer; }
|
|
|
|
|
2017-05-28 14:29:40 +00:00
|
|
|
/// get the part of the buffer from which you can read / write data
|
2017-04-01 07:20:54 +00:00
|
|
|
inline Buffer & buffer() { return working_buffer; }
|
|
|
|
|
2017-05-28 14:29:40 +00:00
|
|
|
/// get (for reading and modifying) the position in the buffer
|
2018-06-03 16:51:31 +00:00
|
|
|
inline Position & position() { return pos; }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-28 14:29:40 +00:00
|
|
|
/// offset in bytes of the cursor from the beginning of the buffer
|
2018-06-03 16:51:31 +00:00
|
|
|
inline size_t offset() const { return size_t(pos - working_buffer.begin()); }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-07-17 16:29:11 +00:00
|
|
|
/// How many bytes are available for read/write
|
2018-07-17 12:32:31 +00:00
|
|
|
inline size_t available() const { return size_t(working_buffer.end() - pos); }
|
|
|
|
|
2018-08-25 11:57:00 +00:00
|
|
|
inline void swap(BufferBase & other)
|
|
|
|
{
|
|
|
|
internal_buffer.swap(other.internal_buffer);
|
|
|
|
working_buffer.swap(other.working_buffer);
|
|
|
|
std::swap(pos, other.pos);
|
|
|
|
}
|
|
|
|
|
2017-05-28 14:29:40 +00:00
|
|
|
/** How many bytes have been read/written, counting those that are still in the buffer. */
|
2019-09-04 21:25:33 +00:00
|
|
|
size_t count() const { return bytes + offset(); }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
/** Check that there is more bytes in buffer after cursor. */
|
2019-09-04 21:25:33 +00:00
|
|
|
bool ALWAYS_INLINE hasPendingData() const { return available() > 0; }
|
2015-02-07 23:13:04 +00:00
|
|
|
|
2019-09-04 21:25:33 +00:00
|
|
|
bool isPadded() const { return padded; }
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
|
2011-06-27 18:22:14 +00:00
|
|
|
protected:
|
2022-02-07 14:20:26 +00:00
|
|
|
void resetWorkingBuffer()
|
|
|
|
{
|
|
|
|
/// Move position to the end of buffer to trigger call of 'next' on next reading.
|
2022-02-07 14:49:53 +00:00
|
|
|
/// Discard all data in current working buffer to prevent wrong assumptions on content
|
2022-02-07 14:20:26 +00:00
|
|
|
/// of buffer, e.g. for optimizations of seeks in seekable buffers.
|
|
|
|
working_buffer.resize(0);
|
|
|
|
pos = working_buffer.end();
|
|
|
|
}
|
|
|
|
|
2018-01-13 19:29:10 +00:00
|
|
|
/// Read/write position.
|
|
|
|
Position pos;
|
|
|
|
|
|
|
|
/** How many bytes have been read/written, not counting those that are now in the buffer.
|
|
|
|
* (counting those that were already used and "removed" from the buffer)
|
|
|
|
*/
|
|
|
|
size_t bytes = 0;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-28 14:29:40 +00:00
|
|
|
/** A piece of memory that you can use.
|
|
|
|
* For example, if internal_buffer is 1MB, and from a file for reading it was loaded into the buffer
|
|
|
|
* only 10 bytes, then working_buffer will be 10 bytes in size
|
|
|
|
* (working_buffer.end() will point to the position immediately after the 10 bytes that can be read).
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
Buffer working_buffer;
|
|
|
|
|
2018-01-13 19:29:10 +00:00
|
|
|
/// A reference to a piece of memory for the buffer.
|
|
|
|
Buffer internal_buffer;
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
|
|
|
|
/// Indicator of 15 bytes pad_right
|
|
|
|
bool padded{false};
|
2011-06-27 18:22:14 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
}
|