2017-04-01 09:19:00 +00:00
|
|
|
#include <Core/Defines.h>
|
2023-03-06 14:53:58 +00:00
|
|
|
#include <base/hex.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Common/PODArray.h>
|
2018-01-15 19:07:47 +00:00
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
2019-02-10 17:40:52 +00:00
|
|
|
#include <Common/memcpySmall.h>
|
2018-06-29 07:34:12 +00:00
|
|
|
#include <Formats/FormatSettings.h>
|
2017-04-16 05:40:17 +00:00
|
|
|
#include <IO/WriteBufferFromString.h>
|
2021-10-16 08:28:10 +00:00
|
|
|
#include <IO/BufferWithOwnMemory.h>
|
2018-01-13 04:45:13 +00:00
|
|
|
#include <IO/readFloatText.h>
|
2017-04-16 05:40:17 +00:00
|
|
|
#include <IO/Operators.h>
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/find_symbols.h>
|
2022-05-08 17:01:47 +00:00
|
|
|
#include <cstdlib>
|
2022-07-31 14:34:05 +00:00
|
|
|
#include <bit>
|
2017-04-16 05:40:17 +00:00
|
|
|
|
2023-08-19 14:53:05 +00:00
|
|
|
#include <base/simd.h>
|
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE2__
|
Enable sse2 for CSV parsing.
Testing data
```
select 'aaaaaaaa,bbbbbbbb,cccccccc,dddddddd,eeeeeeee,ffffffff,gggg,hhh' from numbers(3000000) into outfile '/tmp/test.csv'
```
Testing command
```
echo "select count() from file('/tmp/test.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
Before
```
QPS: 1.317, RPS: 3949749.687, MiB/s: 478.380, result RPS: 1.317, result MiB/s: 0.000.
0.000% 0.704 sec.
10.000% 0.712 sec.
20.000% 0.718 sec.
30.000% 0.726 sec.
40.000% 0.739 sec.
50.000% 0.754 sec.
60.000% 0.770 sec.
70.000% 0.788 sec.
80.000% 0.798 sec.
90.000% 0.815 sec.
95.000% 0.826 sec.
99.000% 0.850 sec.
99.900% 0.857 sec.
99.990% 0.858 sec.
```
After
```
QPS: 1.533, RPS: 4598308.336, MiB/s: 556.932, result RPS: 1.533, result MiB/s: 0.000.
0.000% 0.626 sec.
10.000% 0.635 sec.
20.000% 0.639 sec.
30.000% 0.642 sec.
40.000% 0.643 sec.
50.000% 0.645 sec.
60.000% 0.649 sec.
70.000% 0.652 sec.
80.000% 0.658 sec.
90.000% 0.682 sec.
95.000% 0.710 sec.
99.000% 0.727 sec.
99.900% 0.733 sec.
99.990% 0.734 sec.
```
2018-08-28 11:03:52 +00:00
|
|
|
#include <emmintrin.h>
|
|
|
|
#endif
|
|
|
|
|
2022-06-15 13:19:29 +00:00
|
|
|
#if defined(__aarch64__) && defined(__ARM_NEON)
|
|
|
|
# include <arm_neon.h>
|
2023-03-17 13:27:48 +00:00
|
|
|
# pragma clang diagnostic ignored "-Wreserved-identifier"
|
2022-06-15 13:19:29 +00:00
|
|
|
#endif
|
|
|
|
|
2010-06-04 18:25:25 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2016-01-11 21:46:36 +00:00
|
|
|
/// Error codes declared for use in this file. They are only declared here (`extern`);
/// the definitions live elsewhere.
namespace ErrorCodes
{
    /// Failures while parsing textual input.
    extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
    extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
    extern const int CANNOT_PARSE_QUOTED_STRING;
    extern const int CANNOT_PARSE_DATETIME;
    extern const int CANNOT_PARSE_DATE;
    extern const int CANNOT_PARSE_UUID;

    /// Other conditions.
    extern const int INCORRECT_DATA;
    extern const int ATTEMPT_TO_READ_AFTER_EOF;
    extern const int LOGICAL_ERROR;
    extern const int BAD_ARGUMENTS;
}
|
|
|
|
|
2022-12-30 14:16:20 +00:00
|
|
|
template <size_t num_bytes, typename IteratorSrc, typename IteratorDst>
|
|
|
|
inline void parseHex(IteratorSrc src, IteratorDst dst)
|
2020-06-23 03:14:16 +00:00
|
|
|
{
|
|
|
|
size_t src_pos = 0;
|
|
|
|
size_t dst_pos = 0;
|
|
|
|
for (; dst_pos < num_bytes; ++dst_pos, src_pos += 2)
|
|
|
|
dst[dst_pos] = unhex2(reinterpret_cast<const char *>(&src[src_pos]));
|
|
|
|
}
|
|
|
|
|
2023-06-02 19:36:37 +00:00
|
|
|
/// Parses the textual UUID held in `src`.
/// The layout is chosen purely by length:
///  - 36 characters: hex groups of 8-4-4-4-12 digits with one separator character
///    between groups (the separator positions are skipped, their content is not checked here);
///  - 32 characters: hex digits only.
/// Any other length throws CANNOT_PARSE_UUID.
UUID parseUUID(std::span<const UInt8> src)
{
    UUID result;
    const auto * text = src.data();
    const auto length = src.size();

    /// On little-endian targets the bytes are written through a reverse iterator
    /// that starts at the end of the object; otherwise they are written front to back.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    const std::reverse_iterator out(reinterpret_cast<UInt8 *>(&result) + sizeof(UUID));
#else
    auto * out = reinterpret_cast<UInt8 *>(&result);
#endif

    switch (length)
    {
        case 36:
            parseHex<4>(text, out + 8);
            parseHex<2>(text + 9, out + 12);
            parseHex<2>(text + 14, out + 14);
            parseHex<2>(text + 19, out);
            parseHex<6>(text + 24, out + 2);
            break;

        case 32:
            parseHex<8>(text, out + 8);
            parseHex<8>(text + 16, out);
            break;

        default:
            throw Exception(ErrorCodes::CANNOT_PARSE_UUID, "Unexpected length when trying to parse UUID ({})", length);
    }

    return result;
}
|
|
|
|
|
2018-10-09 21:32:15 +00:00
|
|
|
/// Always throws CANNOT_PARSE_INPUT_ASSERTION_FAILED, reporting that `s` was expected.
/// The message is completed with what was found instead: either a note that the stream
/// has ended, or at most SHOW_CHARS_ON_SYNTAX_ERROR bytes left in the current buffer.
void NO_INLINE throwAtAssertionFailed(const char * s, ReadBuffer & buf)
{
    WriteBufferFromOwnString description;
    description << quote << s;

    if (!buf.eof())
    {
        /// Only the bytes already present in the buffer are shown.
        const auto available = buf.buffer().end() - buf.position();
        description << " before: " << quote << String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, available));
    }
    else
    {
        description << " at end of stream.";
    }

    throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "Cannot parse input: expected {}", description.str());
}
|
|
|
|
|
|
|
|
|
2015-10-05 14:20:56 +00:00
|
|
|
bool checkString(const char * s, ReadBuffer & buf)
|
2010-06-04 18:25:25 +00:00
|
|
|
{
|
|
|
|
for (; *s; ++s)
|
|
|
|
{
|
|
|
|
if (buf.eof() || *buf.position() != *s)
|
2015-10-05 14:20:56 +00:00
|
|
|
return false;
|
2010-06-04 18:25:25 +00:00
|
|
|
++buf.position();
|
|
|
|
}
|
2015-10-05 14:20:56 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-08-16 21:23:53 +00:00
|
|
|
|
2016-12-30 05:13:14 +00:00
|
|
|
bool checkStringCaseInsensitive(const char * s, ReadBuffer & buf)
|
2016-08-16 21:23:53 +00:00
|
|
|
{
|
|
|
|
for (; *s; ++s)
|
|
|
|
{
|
|
|
|
if (buf.eof())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
char c = *buf.position();
|
2016-12-30 05:13:14 +00:00
|
|
|
if (!equalsCaseInsensitive(*s, c))
|
2016-08-16 21:23:53 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
++buf.position();
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2015-10-05 14:20:56 +00:00
|
|
|
void assertString(const char * s, ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
if (!checkString(s, buf))
|
|
|
|
throwAtAssertionFailed(s, buf);
|
2010-06-04 18:25:25 +00:00
|
|
|
}
|
|
|
|
|
2015-06-03 15:32:06 +00:00
|
|
|
|
2014-03-27 11:29:40 +00:00
|
|
|
/// Requires `buf` to be at end of stream; otherwise reports that "eof" was expected
/// via throwAtAssertionFailed().
void assertEOF(ReadBuffer & buf)
{
    if (buf.eof())
        return;

    throwAtAssertionFailed("eof", buf);
}
|
|
|
|
|
2021-12-30 13:15:28 +00:00
|
|
|
/// Requires `buf` to still have data; throws ATTEMPT_TO_READ_AFTER_EOF if the stream has ended.
void assertNotEOF(ReadBuffer & buf)
{
    if (!buf.eof())
        return;

    throw Exception(ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF, "Attempt to read after EOF");
}
|
|
|
|
|
2016-02-16 16:39:39 +00:00
|
|
|
|
2016-12-30 05:13:14 +00:00
|
|
|
void assertStringCaseInsensitive(const char * s, ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
if (!checkStringCaseInsensitive(s, buf))
|
|
|
|
throwAtAssertionFailed(s, buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
bool checkStringByFirstCharacterAndAssertTheRest(const char * s, ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
if (buf.eof() || *buf.position() != *s)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
assertString(s, buf);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool checkStringByFirstCharacterAndAssertTheRestCaseInsensitive(const char * s, ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
if (buf.eof())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
char c = *buf.position();
|
|
|
|
if (!equalsCaseInsensitive(*s, c))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
assertStringCaseInsensitive(s, buf);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2016-02-16 16:39:39 +00:00
|
|
|
template <typename T>
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
static void appendToStringOrVector(T & s, ReadBuffer & rb, const char * end)
|
2016-02-16 16:39:39 +00:00
|
|
|
{
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
s.append(rb.position(), end - rb.position());
|
2016-02-16 16:39:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
template <>
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
inline void appendToStringOrVector(PaddedPODArray<UInt8> & s, ReadBuffer & rb, const char * end)
|
2016-02-16 16:39:39 +00:00
|
|
|
{
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
if (rb.isPadded())
|
|
|
|
s.insertSmallAllowReadWriteOverflow15(rb.position(), end);
|
|
|
|
else
|
|
|
|
s.insert(rb.position(), end);
|
2016-02-16 16:39:39 +00:00
|
|
|
}
|
|
|
|
|
2019-07-01 05:58:31 +00:00
|
|
|
template <>
|
|
|
|
inline void appendToStringOrVector(PODArray<char> & s, ReadBuffer & rb, const char * end)
|
|
|
|
{
|
|
|
|
s.insert(rb.position(), end);
|
|
|
|
}
|
|
|
|
|
2020-12-02 21:05:51 +00:00
|
|
|
template <char... chars, typename Vector>
|
|
|
|
void readStringUntilCharsInto(Vector & s, ReadBuffer & buf)
|
2010-06-04 18:25:25 +00:00
|
|
|
{
|
|
|
|
while (!buf.eof())
|
|
|
|
{
|
2020-12-02 21:05:51 +00:00
|
|
|
char * next_pos = find_first_symbols<chars...>(buf.position(), buf.buffer().end());
|
2010-06-04 18:25:25 +00:00
|
|
|
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
appendToStringOrVector(s, buf, next_pos);
|
2018-08-25 15:32:55 +00:00
|
|
|
buf.position() = next_pos;
|
2010-06-04 18:25:25 +00:00
|
|
|
|
2015-02-07 23:13:04 +00:00
|
|
|
if (buf.hasPendingData())
|
2010-06-04 18:25:25 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-07-01 05:58:31 +00:00
|
|
|
template <typename Vector>
|
2020-12-02 21:05:51 +00:00
|
|
|
void readStringInto(Vector & s, ReadBuffer & buf)
|
2019-07-01 05:58:31 +00:00
|
|
|
{
|
2020-12-02 21:05:51 +00:00
|
|
|
readStringUntilCharsInto<'\t', '\n'>(s, buf);
|
|
|
|
}
|
2019-07-01 05:58:31 +00:00
|
|
|
|
2020-12-02 21:05:51 +00:00
|
|
|
template <typename Vector>
|
|
|
|
void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
readStringUntilCharsInto<' '>(s, buf);
|
|
|
|
}
|
2019-07-01 05:58:31 +00:00
|
|
|
|
2022-02-03 21:07:31 +00:00
|
|
|
template <typename Vector>
|
|
|
|
void readStringUntilNewlineInto(Vector & s, ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
readStringUntilCharsInto<'\n'>(s, buf);
|
|
|
|
}
|
|
|
|
|
2022-02-03 22:44:43 +00:00
|
|
|
/// Explicit instantiation for PaddedPODArray<UInt8>, so this version is emitted from this translation unit.
template void readStringUntilNewlineInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
|
2022-02-03 21:25:36 +00:00
|
|
|
/// Explicit instantiation for String, so this version is emitted from this translation unit.
template void readStringUntilNewlineInto<String>(String & s, ReadBuffer & buf);
|
|
|
|
|
2020-12-02 21:05:51 +00:00
|
|
|
template <typename Vector>
|
|
|
|
void readNullTerminated(Vector & s, ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
readStringUntilCharsInto<'\0'>(s, buf);
|
2019-07-01 05:58:31 +00:00
|
|
|
buf.ignore();
|
|
|
|
}
|
|
|
|
|
2020-12-02 21:05:51 +00:00
|
|
|
/// Replaces the contents of `s` with the bytes read from `buf` up to the first space
/// (see readStringUntilWhitespaceInto).
void readStringUntilWhitespace(String & s, ReadBuffer & buf)
{
    /// Start from an empty string: the *Into helper only appends.
    s.clear();
    readStringUntilWhitespaceInto(s, buf);
}
|
|
|
|
|
2019-07-01 05:58:31 +00:00
|
|
|
/// Explicit instantiation for PODArray<char>, so this version is emitted from this translation unit.
template void readNullTerminated<PODArray<char>>(PODArray<char> & s, ReadBuffer & buf);
|
|
|
|
/// Explicit instantiation for String, so this version is emitted from this translation unit.
template void readNullTerminated<String>(String & s, ReadBuffer & buf);
|
|
|
|
|
2016-02-16 16:39:39 +00:00
|
|
|
/// Replaces the contents of `s` with the bytes read from `buf` up to the first tab or newline
/// (see readStringInto).
void readString(String & s, ReadBuffer & buf)
{
    /// Start from an empty string: the *Into helper only appends.
    s.clear();
    readStringInto(s, buf);
}
|
|
|
|
|
2016-04-15 00:33:21 +00:00
|
|
|
/// Explicit instantiation for PaddedPODArray<UInt8>, so this version is emitted from this translation unit.
template void readStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
|
2021-05-07 02:19:54 +00:00
|
|
|
/// Explicit instantiation for String, so this version is emitted from this translation unit.
template void readStringInto<String>(String & s, ReadBuffer & buf);
|
2022-05-13 15:08:02 +00:00
|
|
|
/// Explicit instantiation for NullOutput, so this version is emitted from this translation unit.
template void readStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
|
2016-02-16 16:39:39 +00:00
|
|
|
|
|
|
|
template <typename Vector>
|
|
|
|
void readStringUntilEOFInto(Vector & s, ReadBuffer & buf)
|
|
|
|
{
|
2015-09-08 14:24:25 +00:00
|
|
|
while (!buf.eof())
|
|
|
|
{
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
appendToStringOrVector(s, buf, buf.buffer().end());
|
2018-08-25 15:32:55 +00:00
|
|
|
buf.position() = buf.buffer().end();
|
2015-09-08 14:24:25 +00:00
|
|
|
}
|
|
|
|
}
|
2013-01-05 10:07:01 +00:00
|
|
|
|
2019-02-27 16:41:51 +00:00
|
|
|
|
2016-02-16 16:39:39 +00:00
|
|
|
/// Replaces the contents of `s` with everything that remains in `buf`
/// (see readStringUntilEOFInto).
void readStringUntilEOF(String & s, ReadBuffer & buf)
{
    /// Start from an empty string: the *Into helper only appends.
    s.clear();
    readStringUntilEOFInto(s, buf);
}
|
|
|
|
|
2019-02-27 16:41:51 +00:00
|
|
|
template <typename Vector>
|
|
|
|
void readEscapedStringUntilEOLInto(Vector & s, ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
while (!buf.eof())
|
|
|
|
{
|
|
|
|
char * next_pos = find_first_symbols<'\n', '\\'>(buf.position(), buf.buffer().end());
|
|
|
|
|
|
|
|
appendToStringOrVector(s, buf, next_pos);
|
|
|
|
buf.position() = next_pos;
|
|
|
|
|
|
|
|
if (!buf.hasPendingData())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (*buf.position() == '\n')
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (*buf.position() == '\\')
|
|
|
|
parseComplexEscapeSequence(s, buf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Replaces the contents of `s` with the escaped string read from `buf` up to the end of line
/// (see readEscapedStringUntilEOLInto).
void readEscapedStringUntilEOL(String & s, ReadBuffer & buf)
{
    /// Start from an empty string: the *Into helper only appends.
    s.clear();
    readEscapedStringUntilEOLInto(s, buf);
}
|
|
|
|
|
2016-04-15 00:33:21 +00:00
|
|
|
/// Explicit instantiation for PaddedPODArray<UInt8>, so this version is emitted from this translation unit.
template void readStringUntilEOFInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
|
2016-02-16 16:39:39 +00:00
|
|
|
|
|
|
|
|
2017-03-25 20:12:56 +00:00
|
|
|
/** Parse the escape sequence, which can be simple (one character after backslash) or more complex (multiple characters).
|
|
|
|
* It is assumed that the cursor is located on the `\` symbol
|
2015-11-25 03:11:17 +00:00
|
|
|
*/
|
2022-12-07 21:19:27 +00:00
|
|
|
template <typename Vector, typename ReturnType = void>
|
|
|
|
static ReturnType parseComplexEscapeSequence(Vector & s, ReadBuffer & buf)
|
2015-11-25 03:11:17 +00:00
|
|
|
{
|
2023-04-21 14:31:59 +00:00
|
|
|
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
|
|
|
|
|
|
|
|
auto error = [](const char * message [[maybe_unused]], int code [[maybe_unused]])
|
|
|
|
{
|
|
|
|
if constexpr (throw_exception)
|
|
|
|
throw Exception::createDeprecated(message, code);
|
|
|
|
return ReturnType(false);
|
|
|
|
};
|
|
|
|
|
2015-11-25 03:11:17 +00:00
|
|
|
++buf.position();
|
2023-04-21 14:31:59 +00:00
|
|
|
|
2015-11-25 03:11:17 +00:00
|
|
|
if (buf.eof())
|
2022-12-07 21:19:27 +00:00
|
|
|
{
|
2023-04-21 14:31:59 +00:00
|
|
|
return error("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
|
2022-12-07 21:19:27 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-05-26 14:24:20 +00:00
|
|
|
char char_after_backslash = *buf.position();
|
|
|
|
|
|
|
|
if (char_after_backslash == 'x')
|
2015-11-25 03:11:17 +00:00
|
|
|
{
|
|
|
|
++buf.position();
|
2017-08-09 01:34:01 +00:00
|
|
|
/// escape sequence of the form \xAA
|
|
|
|
char hex_code[2];
|
2023-03-21 15:21:09 +00:00
|
|
|
|
2023-04-21 14:31:59 +00:00
|
|
|
auto bytes_read = buf.read(hex_code, sizeof(hex_code));
|
|
|
|
|
|
|
|
if (bytes_read != sizeof(hex_code))
|
2023-03-21 15:21:09 +00:00
|
|
|
{
|
2023-04-21 14:31:59 +00:00
|
|
|
return error("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
|
2023-03-21 15:21:09 +00:00
|
|
|
}
|
|
|
|
|
2017-08-09 01:34:01 +00:00
|
|
|
s.push_back(unhex2(hex_code));
|
2015-11-25 03:11:17 +00:00
|
|
|
}
|
2020-05-26 14:24:20 +00:00
|
|
|
else if (char_after_backslash == 'N')
|
2016-12-30 05:13:14 +00:00
|
|
|
{
|
|
|
|
/// Support for NULLs: \N sequence must be parsed as empty string.
|
|
|
|
++buf.position();
|
|
|
|
}
|
2015-11-25 03:11:17 +00:00
|
|
|
else
|
|
|
|
{
|
2017-03-25 20:12:56 +00:00
|
|
|
/// The usual escape sequence of a single character.
|
2020-05-26 14:24:20 +00:00
|
|
|
char decoded_char = parseEscapeSequence(char_after_backslash);
|
|
|
|
|
|
|
|
/// For convenience using LIKE and regular expressions,
|
|
|
|
/// we leave backslash when user write something like 'Hello 100\%':
|
|
|
|
/// it is parsed like Hello 100\% instead of Hello 100%
|
2020-05-26 21:48:29 +00:00
|
|
|
if (decoded_char != '\\'
|
|
|
|
&& decoded_char != '\''
|
|
|
|
&& decoded_char != '"'
|
|
|
|
&& decoded_char != '`' /// MySQL style identifiers
|
|
|
|
&& decoded_char != '/' /// JavaScript in HTML
|
2022-04-15 22:20:47 +00:00
|
|
|
&& decoded_char != '=' /// TSKV format invented somewhere
|
2020-05-26 21:48:29 +00:00
|
|
|
&& !isControlASCII(decoded_char))
|
|
|
|
{
|
2020-05-26 14:24:20 +00:00
|
|
|
s.push_back('\\');
|
2020-05-26 21:48:29 +00:00
|
|
|
}
|
2020-05-26 14:24:20 +00:00
|
|
|
|
|
|
|
s.push_back(decoded_char);
|
2015-11-25 03:11:17 +00:00
|
|
|
++buf.position();
|
|
|
|
}
|
2022-12-07 21:19:27 +00:00
|
|
|
|
|
|
|
return ReturnType(true);
|
2015-11-25 03:11:17 +00:00
|
|
|
}
|
|
|
|
|
2023-03-21 15:21:09 +00:00
|
|
|
/// Non-throwing entry point for String: decodes one escape sequence from `buf` into `s`
/// and returns false instead of throwing when the sequence is malformed.
bool parseComplexEscapeSequence(String & s, ReadBuffer & buf)
{
    return parseComplexEscapeSequence<String, bool>(s, buf);
}
|
2015-11-25 03:11:17 +00:00
|
|
|
|
2017-08-09 01:34:01 +00:00
|
|
|
template <typename Vector, typename ReturnType>
|
|
|
|
static ReturnType parseJSONEscapeSequence(Vector & s, ReadBuffer & buf)
|
2016-02-18 11:44:50 +00:00
|
|
|
{
|
2017-12-25 04:01:46 +00:00
|
|
|
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
|
2017-08-09 01:34:01 +00:00
|
|
|
|
2018-07-16 06:09:27 +00:00
|
|
|
auto error = [](const char * message [[maybe_unused]], int code [[maybe_unused]])
|
2017-08-09 01:34:01 +00:00
|
|
|
{
|
2018-07-14 23:39:00 +00:00
|
|
|
if constexpr (throw_exception)
|
2023-01-23 13:16:14 +00:00
|
|
|
throw Exception::createDeprecated(message, code);
|
2017-08-09 01:34:01 +00:00
|
|
|
return ReturnType(false);
|
|
|
|
};
|
|
|
|
|
2016-02-18 11:44:50 +00:00
|
|
|
++buf.position();
|
2021-07-14 11:59:06 +00:00
|
|
|
|
2016-02-18 11:44:50 +00:00
|
|
|
if (buf.eof())
|
2017-08-09 01:34:01 +00:00
|
|
|
return error("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2021-07-14 11:59:06 +00:00
|
|
|
assert(buf.hasPendingData());
|
|
|
|
|
2017-08-09 01:34:01 +00:00
|
|
|
switch (*buf.position())
|
2016-02-18 11:44:50 +00:00
|
|
|
{
|
|
|
|
case '"':
|
|
|
|
s.push_back('"');
|
|
|
|
break;
|
|
|
|
case '\\':
|
|
|
|
s.push_back('\\');
|
|
|
|
break;
|
|
|
|
case '/':
|
|
|
|
s.push_back('/');
|
|
|
|
break;
|
|
|
|
case 'b':
|
|
|
|
s.push_back('\b');
|
|
|
|
break;
|
|
|
|
case 'f':
|
|
|
|
s.push_back('\f');
|
|
|
|
break;
|
|
|
|
case 'n':
|
|
|
|
s.push_back('\n');
|
|
|
|
break;
|
|
|
|
case 'r':
|
|
|
|
s.push_back('\r');
|
|
|
|
break;
|
|
|
|
case 't':
|
|
|
|
s.push_back('\t');
|
|
|
|
break;
|
|
|
|
case 'u':
|
|
|
|
{
|
|
|
|
++buf.position();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-02-18 11:44:50 +00:00
|
|
|
char hex_code[4];
|
2017-08-09 01:34:01 +00:00
|
|
|
if (4 != buf.read(hex_code, 4))
|
|
|
|
return error("Cannot parse escape sequence: less than four bytes after \\u", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-25 20:12:56 +00:00
|
|
|
/// \u0000 - special case
|
2017-08-09 01:34:01 +00:00
|
|
|
if (0 == memcmp(hex_code, "0000", 4))
|
2016-02-18 11:44:50 +00:00
|
|
|
{
|
|
|
|
s.push_back(0);
|
2017-08-09 01:34:01 +00:00
|
|
|
return ReturnType(true);
|
2016-02-18 11:44:50 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-08-09 01:34:01 +00:00
|
|
|
UInt16 code_point = unhex4(hex_code);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-02-18 11:44:50 +00:00
|
|
|
if (code_point <= 0x7F)
|
|
|
|
{
|
|
|
|
s.push_back(code_point);
|
|
|
|
}
|
2017-08-09 01:34:01 +00:00
|
|
|
else if (code_point <= 0x07FF)
|
2016-02-18 11:44:50 +00:00
|
|
|
{
|
|
|
|
s.push_back(((code_point >> 6) & 0x1F) | 0xC0);
|
|
|
|
s.push_back((code_point & 0x3F) | 0x80);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2017-03-25 20:12:56 +00:00
|
|
|
/// Surrogate pair.
|
2016-02-18 11:44:50 +00:00
|
|
|
if (code_point >= 0xD800 && code_point <= 0xDBFF)
|
|
|
|
{
|
2017-08-09 01:34:01 +00:00
|
|
|
if (!checkString("\\u", buf))
|
|
|
|
return error("Cannot parse escape sequence: missing second part of surrogate pair", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
|
|
|
|
|
2016-02-18 11:44:50 +00:00
|
|
|
char second_hex_code[4];
|
2017-08-09 01:34:01 +00:00
|
|
|
if (4 != buf.read(second_hex_code, 4))
|
|
|
|
return error("Cannot parse escape sequence: less than four bytes after \\u of second part of surrogate pair",
|
|
|
|
ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-08-09 01:34:01 +00:00
|
|
|
UInt16 second_code_point = unhex4(second_hex_code);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-02-18 11:44:50 +00:00
|
|
|
if (second_code_point >= 0xDC00 && second_code_point <= 0xDFFF)
|
|
|
|
{
|
2016-02-18 11:54:16 +00:00
|
|
|
UInt32 full_code_point = 0x10000 + (code_point - 0xD800) * 1024 + (second_code_point - 0xDC00);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-02-18 11:44:50 +00:00
|
|
|
s.push_back(((full_code_point >> 18) & 0x07) | 0xF0);
|
|
|
|
s.push_back(((full_code_point >> 12) & 0x3F) | 0x80);
|
|
|
|
s.push_back(((full_code_point >> 6) & 0x3F) | 0x80);
|
|
|
|
s.push_back((full_code_point & 0x3F) | 0x80);
|
|
|
|
}
|
|
|
|
else
|
2017-08-09 01:34:01 +00:00
|
|
|
return error("Incorrect surrogate pair of unicode escape sequences in JSON", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
|
2016-02-18 11:44:50 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
s.push_back(((code_point >> 12) & 0x0F) | 0xE0);
|
|
|
|
s.push_back(((code_point >> 6) & 0x3F) | 0x80);
|
|
|
|
s.push_back((code_point & 0x3F) | 0x80);
|
|
|
|
}
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-08-09 01:34:01 +00:00
|
|
|
return ReturnType(true);
|
2016-02-18 11:44:50 +00:00
|
|
|
}
|
|
|
|
default:
|
|
|
|
s.push_back(*buf.position());
|
|
|
|
break;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-02-18 11:44:50 +00:00
|
|
|
++buf.position();
|
2017-08-09 01:34:01 +00:00
|
|
|
return ReturnType(true);
|
2016-02-18 11:44:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2023-01-06 20:46:43 +00:00
|
|
|
template <typename Vector, bool parse_complex_escape_sequence>
|
|
|
|
void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf)
|
2010-06-04 18:25:25 +00:00
|
|
|
{
|
|
|
|
while (!buf.eof())
|
|
|
|
{
|
2018-08-25 15:32:55 +00:00
|
|
|
char * next_pos = find_first_symbols<'\t', '\n', '\\'>(buf.position(), buf.buffer().end());
|
2010-06-04 18:25:25 +00:00
|
|
|
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
appendToStringOrVector(s, buf, next_pos);
|
2018-08-25 15:32:55 +00:00
|
|
|
buf.position() = next_pos;
|
2010-06-04 18:25:25 +00:00
|
|
|
|
2015-02-07 23:13:04 +00:00
|
|
|
if (!buf.hasPendingData())
|
2011-12-26 02:17:33 +00:00
|
|
|
continue;
|
|
|
|
|
2010-06-04 18:25:25 +00:00
|
|
|
if (*buf.position() == '\t' || *buf.position() == '\n')
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (*buf.position() == '\\')
|
2023-01-06 20:46:43 +00:00
|
|
|
{
|
|
|
|
if constexpr (parse_complex_escape_sequence)
|
|
|
|
{
|
|
|
|
parseComplexEscapeSequence(s, buf);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
s.push_back(*buf.position());
|
|
|
|
++buf.position();
|
|
|
|
if (!buf.eof())
|
|
|
|
{
|
|
|
|
s.push_back(*buf.position());
|
|
|
|
++buf.position();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2010-06-04 18:25:25 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-01-06 20:46:43 +00:00
|
|
|
template <typename Vector>
|
|
|
|
void readEscapedStringInto(Vector & s, ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
readEscapedStringIntoImpl<Vector, true>(s, buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2016-02-16 16:39:39 +00:00
|
|
|
/// Replaces the contents of `s` with an escaped string read from `buf`.
void readEscapedString(String & s, ReadBuffer & buf)
{
    /// The "Into" variant only appends, so drop the previous contents first.
    s.clear();
    readEscapedStringInto<String>(s, buf);
}
|
|
|
|
|
2016-04-15 00:33:21 +00:00
|
|
|
/// Explicit instantiation of readEscapedStringInto for PaddedPODArray<UInt8> output.
template void readEscapedStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
|
2020-10-29 17:22:48 +00:00
|
|
|
/// Explicit instantiation of readEscapedStringInto for NullOutput.
template void readEscapedStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
|
2016-02-16 16:39:39 +00:00
|
|
|
|
2011-06-15 18:54:18 +00:00
|
|
|
|
2017-06-25 03:43:37 +00:00
|
|
|
/** If enable_sql_style_quoting == true,
|
|
|
|
* strings like 'abc''def' will be parsed as abc'def.
|
|
|
|
* Please note, that even with SQL style quoting enabled,
|
|
|
|
* backslash escape sequences are also parsed,
|
|
|
|
* that could be slightly confusing.
|
|
|
|
*/
|
2022-12-07 21:19:27 +00:00
|
|
|
template <char quote, bool enable_sql_style_quoting, typename Vector, typename ReturnType = void>
|
|
|
|
static ReturnType readAnyQuotedStringInto(Vector & s, ReadBuffer & buf)
|
2010-06-04 18:25:25 +00:00
|
|
|
{
|
2022-12-07 21:19:27 +00:00
|
|
|
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
|
2011-06-15 18:54:18 +00:00
|
|
|
if (buf.eof() || *buf.position() != quote)
|
2020-11-18 10:38:30 +00:00
|
|
|
{
|
2022-12-07 21:19:27 +00:00
|
|
|
if constexpr (throw_exception)
|
2023-11-21 13:13:42 +00:00
|
|
|
throw Exception(ErrorCodes::CANNOT_PARSE_QUOTED_STRING,
|
2022-12-07 21:19:27 +00:00
|
|
|
"Cannot parse quoted string: expected opening quote '{}', got '{}'",
|
|
|
|
std::string{quote}, buf.eof() ? "EOF" : std::string{*buf.position()});
|
|
|
|
else
|
2022-12-12 22:00:45 +00:00
|
|
|
return ReturnType(false);
|
2020-11-18 10:38:30 +00:00
|
|
|
}
|
|
|
|
|
2010-06-04 18:25:25 +00:00
|
|
|
++buf.position();
|
|
|
|
|
|
|
|
while (!buf.eof())
|
|
|
|
{
|
2018-08-25 15:32:55 +00:00
|
|
|
char * next_pos = find_first_symbols<'\\', quote>(buf.position(), buf.buffer().end());
|
2010-06-04 18:25:25 +00:00
|
|
|
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
appendToStringOrVector(s, buf, next_pos);
|
2018-08-25 15:32:55 +00:00
|
|
|
buf.position() = next_pos;
|
2015-02-07 23:13:04 +00:00
|
|
|
|
|
|
|
if (!buf.hasPendingData())
|
2011-12-26 02:17:33 +00:00
|
|
|
continue;
|
|
|
|
|
2011-06-15 18:54:18 +00:00
|
|
|
if (*buf.position() == quote)
|
2010-06-04 18:25:25 +00:00
|
|
|
{
|
|
|
|
++buf.position();
|
2017-06-25 03:43:37 +00:00
|
|
|
|
|
|
|
if (enable_sql_style_quoting && !buf.eof() && *buf.position() == quote)
|
|
|
|
{
|
|
|
|
s.push_back(quote);
|
|
|
|
++buf.position();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2022-12-07 21:19:27 +00:00
|
|
|
return ReturnType(true);
|
2010-06-04 18:25:25 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (*buf.position() == '\\')
|
2022-12-07 21:19:27 +00:00
|
|
|
{
|
|
|
|
if constexpr (throw_exception)
|
|
|
|
parseComplexEscapeSequence<Vector, ReturnType>(s, buf);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (!parseComplexEscapeSequence<Vector, ReturnType>(s, buf))
|
2022-12-12 22:00:45 +00:00
|
|
|
return ReturnType(false);
|
2022-12-07 21:19:27 +00:00
|
|
|
}
|
|
|
|
}
|
2010-06-04 18:25:25 +00:00
|
|
|
}
|
|
|
|
|
2022-12-07 21:19:27 +00:00
|
|
|
if constexpr (throw_exception)
|
2023-11-21 13:13:42 +00:00
|
|
|
throw Exception(ErrorCodes::CANNOT_PARSE_QUOTED_STRING, "Cannot parse quoted string: expected closing quote");
|
2022-12-07 21:19:27 +00:00
|
|
|
else
|
2022-12-12 22:00:45 +00:00
|
|
|
return ReturnType(false);
|
2010-06-04 18:25:25 +00:00
|
|
|
}
|
|
|
|
|
2017-06-25 03:43:37 +00:00
|
|
|
template <bool enable_sql_style_quoting, typename Vector>
|
2016-02-16 16:39:39 +00:00
|
|
|
void readQuotedStringInto(Vector & s, ReadBuffer & buf)
|
|
|
|
{
|
2017-06-25 03:43:37 +00:00
|
|
|
readAnyQuotedStringInto<'\'', enable_sql_style_quoting>(s, buf);
|
2016-02-16 16:39:39 +00:00
|
|
|
}
|
|
|
|
|
2022-12-07 21:19:27 +00:00
|
|
|
template <typename Vector>
|
|
|
|
bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
return readAnyQuotedStringInto<'\'', false, Vector, bool>(s, buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Explicit instantiation of tryReadQuotedStringInto for String output.
template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf);
|
|
|
|
|
2017-06-25 03:43:37 +00:00
|
|
|
template <bool enable_sql_style_quoting, typename Vector>
|
2016-02-16 16:39:39 +00:00
|
|
|
void readDoubleQuotedStringInto(Vector & s, ReadBuffer & buf)
|
|
|
|
{
|
2017-06-25 03:43:37 +00:00
|
|
|
readAnyQuotedStringInto<'"', enable_sql_style_quoting>(s, buf);
|
2016-02-16 16:39:39 +00:00
|
|
|
}
|
|
|
|
|
2017-06-25 03:43:37 +00:00
|
|
|
template <bool enable_sql_style_quoting, typename Vector>
|
2016-02-16 16:39:39 +00:00
|
|
|
void readBackQuotedStringInto(Vector & s, ReadBuffer & buf)
|
|
|
|
{
|
2017-06-25 03:43:37 +00:00
|
|
|
readAnyQuotedStringInto<'`', enable_sql_style_quoting>(s, buf);
|
2016-02-16 16:39:39 +00:00
|
|
|
}
|
|
|
|
|
2011-06-15 18:54:18 +00:00
|
|
|
|
|
|
|
/// Replaces the contents of `s` with a single-quoted string read from `buf`
/// (SQL style quoting disabled).
void readQuotedString(String & s, ReadBuffer & buf)
{
    /// The "Into" variant only appends, so drop the previous contents first.
    s.clear();
    readQuotedStringInto<false, String>(s, buf);
}
|
|
|
|
|
2017-06-25 03:43:37 +00:00
|
|
|
/// Replaces the contents of `s` with a single-quoted string read from `buf`,
/// with SQL style quoting enabled (a doubled quote stands for one quote character).
void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf)
{
    /// The "Into" variant only appends, so drop the previous contents first.
    s.clear();
    readQuotedStringInto<true, String>(s, buf);
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Explicit instantiation: single-quoted string, SQL style quoting enabled, PaddedPODArray<UInt8> output.
template void readQuotedStringInto<true>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
|
2022-03-16 15:28:09 +00:00
|
|
|
/// Explicit instantiation: single-quoted string, SQL style quoting enabled, String output.
template void readQuotedStringInto<true>(String & s, ReadBuffer & buf);
|
2022-05-03 13:56:25 +00:00
|
|
|
/// Explicit instantiation: single-quoted string, SQL style quoting disabled, String output.
template void readQuotedStringInto<false>(String & s, ReadBuffer & buf);
|
2020-10-29 17:22:48 +00:00
|
|
|
/// Explicit instantiation: double-quoted string, SQL style quoting disabled, NullOutput.
template void readDoubleQuotedStringInto<false>(NullOutput & s, ReadBuffer & buf);
|
2022-05-03 13:56:25 +00:00
|
|
|
/// Explicit instantiation: double-quoted string, SQL style quoting disabled, String output.
template void readDoubleQuotedStringInto<false>(String & s, ReadBuffer & buf);
|
2022-05-02 12:07:31 +00:00
|
|
|
/// Explicit instantiation: back-quoted string, SQL style quoting disabled, String output.
template void readBackQuotedStringInto<false>(String & s, ReadBuffer & buf);
|
2016-02-16 16:39:39 +00:00
|
|
|
|
2011-06-15 18:54:18 +00:00
|
|
|
/// Replaces the contents of `s` with a double-quoted string read from `buf`
/// (SQL style quoting disabled).
void readDoubleQuotedString(String & s, ReadBuffer & buf)
{
    /// The "Into" variant only appends, so drop the previous contents first.
    s.clear();
    readDoubleQuotedStringInto<false, String>(s, buf);
}
|
|
|
|
|
2017-06-25 03:43:37 +00:00
|
|
|
/// Replaces the contents of `s` with a double-quoted string read from `buf`,
/// with SQL style quoting enabled (a doubled quote stands for one quote character).
void readDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf)
{
    /// The "Into" variant only appends, so drop the previous contents first.
    s.clear();
    readDoubleQuotedStringInto<true, String>(s, buf);
}
|
2016-02-16 16:39:39 +00:00
|
|
|
|
2011-11-01 17:57:37 +00:00
|
|
|
/// Replaces the contents of `s` with a back-quoted string read from `buf`
/// (SQL style quoting disabled).
void readBackQuotedString(String & s, ReadBuffer & buf)
{
    /// The "Into" variant only appends, so drop the previous contents first.
    s.clear();
    readBackQuotedStringInto<false, String>(s, buf);
}
|
|
|
|
|
2017-06-25 03:43:37 +00:00
|
|
|
/// Replaces the contents of `s` with a back-quoted string read from `buf`,
/// with SQL style quoting enabled (a doubled quote stands for one quote character).
void readBackQuotedStringWithSQLStyle(String & s, ReadBuffer & buf)
{
    /// The "Into" variant only appends, so drop the previous contents first.
    s.clear();
    readBackQuotedStringInto<true, String>(s, buf);
}
|
2012-05-08 05:42:05 +00:00
|
|
|
|
2022-05-20 14:57:27 +00:00
|
|
|
template<typename T>
|
|
|
|
concept WithResize = requires (T value)
|
|
|
|
{
|
|
|
|
{ value.resize(1) };
|
|
|
|
{ value.size() } -> std::integral<>;
|
|
|
|
};
|
2016-02-07 08:42:21 +00:00
|
|
|
|
2023-01-05 22:57:25 +00:00
|
|
|
template <typename Vector, bool include_quotes>
|
2018-07-04 21:00:50 +00:00
|
|
|
void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings)
|
2016-02-16 16:39:39 +00:00
|
|
|
{
|
2022-08-29 11:18:53 +00:00
|
|
|
/// Empty string
|
2016-02-07 08:42:21 +00:00
|
|
|
if (buf.eof())
|
2022-08-29 11:18:53 +00:00
|
|
|
return;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-07-04 21:09:58 +00:00
|
|
|
const char delimiter = settings.delimiter;
|
2018-06-29 07:34:12 +00:00
|
|
|
const char maybe_quote = *buf.position();
|
2022-11-17 15:21:38 +00:00
|
|
|
const String & custom_delimiter = settings.custom_delimiter;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-25 20:12:56 +00:00
|
|
|
/// Emptiness and not even in quotation marks.
|
2022-11-17 15:21:38 +00:00
|
|
|
if (custom_delimiter.empty() && maybe_quote == delimiter)
|
2016-02-07 08:42:21 +00:00
|
|
|
return;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-07-04 21:09:58 +00:00
|
|
|
if ((settings.allow_single_quotes && maybe_quote == '\'') || (settings.allow_double_quotes && maybe_quote == '"'))
|
2016-02-07 08:42:21 +00:00
|
|
|
{
|
2023-01-05 22:57:25 +00:00
|
|
|
if constexpr (include_quotes)
|
|
|
|
s.push_back(maybe_quote);
|
|
|
|
|
2016-02-07 08:42:21 +00:00
|
|
|
++buf.position();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-25 20:12:56 +00:00
|
|
|
/// The quoted case. We are looking for the next quotation mark.
|
2016-02-07 08:42:21 +00:00
|
|
|
while (!buf.eof())
|
|
|
|
{
|
2018-08-25 15:32:55 +00:00
|
|
|
char * next_pos = reinterpret_cast<char *>(memchr(buf.position(), maybe_quote, buf.buffer().end() - buf.position()));
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-02-07 08:42:21 +00:00
|
|
|
if (nullptr == next_pos)
|
|
|
|
next_pos = buf.buffer().end();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
appendToStringOrVector(s, buf, next_pos);
|
2018-08-25 15:32:55 +00:00
|
|
|
buf.position() = next_pos;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-02-07 08:42:21 +00:00
|
|
|
if (!buf.hasPendingData())
|
|
|
|
continue;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2023-01-05 22:57:25 +00:00
|
|
|
if constexpr (include_quotes)
|
|
|
|
s.push_back(maybe_quote);
|
|
|
|
|
2017-03-25 20:12:56 +00:00
|
|
|
/// Now there is a quotation mark under the cursor. Is there any following?
|
2016-02-07 08:42:21 +00:00
|
|
|
++buf.position();
|
2023-01-05 22:57:25 +00:00
|
|
|
|
2016-02-07 08:42:21 +00:00
|
|
|
if (buf.eof())
|
|
|
|
return;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-02-07 08:42:21 +00:00
|
|
|
if (*buf.position() == maybe_quote)
|
|
|
|
{
|
2016-02-16 16:39:39 +00:00
|
|
|
s.push_back(maybe_quote);
|
2016-02-07 08:42:21 +00:00
|
|
|
++buf.position();
|
|
|
|
continue;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-02-07 08:42:21 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2022-11-17 15:21:38 +00:00
|
|
|
/// If custom_delimiter is specified, we should read until first occurrences of
|
|
|
|
/// custom_delimiter in buffer.
|
|
|
|
if (!custom_delimiter.empty())
|
|
|
|
{
|
|
|
|
PeekableReadBuffer * peekable_buf = dynamic_cast<PeekableReadBuffer *>(&buf);
|
|
|
|
if (!peekable_buf)
|
|
|
|
throw Exception(ErrorCodes::LOGICAL_ERROR, "Reading CSV string with custom delimiter is allowed only when using PeekableReadBuffer");
|
|
|
|
|
|
|
|
while (true)
|
|
|
|
{
|
|
|
|
if (peekable_buf->eof())
|
|
|
|
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading CSV string, expected custom delimiter \"{}\"", custom_delimiter);
|
|
|
|
|
|
|
|
char * next_pos = reinterpret_cast<char *>(memchr(peekable_buf->position(), custom_delimiter[0], peekable_buf->available()));
|
|
|
|
if (!next_pos)
|
|
|
|
next_pos = peekable_buf->buffer().end();
|
|
|
|
|
|
|
|
appendToStringOrVector(s, *peekable_buf, next_pos);
|
|
|
|
peekable_buf->position() = next_pos;
|
|
|
|
|
|
|
|
if (!buf.hasPendingData())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
{
|
|
|
|
PeekableReadBufferCheckpoint checkpoint{*peekable_buf, true};
|
|
|
|
if (checkString(custom_delimiter, *peekable_buf))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
s.push_back(*peekable_buf->position());
|
|
|
|
++peekable_buf->position();
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2023-10-26 13:50:54 +00:00
|
|
|
/// Unquoted case. Look for delimiter or \r (followed by '\n') or \n.
|
2016-02-07 08:42:21 +00:00
|
|
|
while (!buf.eof())
|
|
|
|
{
|
2018-08-25 15:32:55 +00:00
|
|
|
char * next_pos = buf.position();
|
Enable sse2 for CSV parsing.
Testing data
```
select 'aaaaaaaa,bbbbbbbb,cccccccc,dddddddd,eeeeeeee,ffffffff,gggg,hhh' from numbers(3000000) into outfile '/tmp/test.csv'
```
Testing command
```
echo "select count() from file('/tmp/test.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
Before
```
QPS: 1.317, RPS: 3949749.687, MiB/s: 478.380, result RPS: 1.317, result MiB/s: 0.000.
0.000% 0.704 sec.
10.000% 0.712 sec.
20.000% 0.718 sec.
30.000% 0.726 sec.
40.000% 0.739 sec.
50.000% 0.754 sec.
60.000% 0.770 sec.
70.000% 0.788 sec.
80.000% 0.798 sec.
90.000% 0.815 sec.
95.000% 0.826 sec.
99.000% 0.850 sec.
99.900% 0.857 sec.
99.990% 0.858 sec.
```
After
```
QPS: 1.533, RPS: 4598308.336, MiB/s: 556.932, result RPS: 1.533, result MiB/s: 0.000.
0.000% 0.626 sec.
10.000% 0.635 sec.
20.000% 0.639 sec.
30.000% 0.642 sec.
40.000% 0.643 sec.
50.000% 0.645 sec.
60.000% 0.649 sec.
70.000% 0.652 sec.
80.000% 0.658 sec.
90.000% 0.682 sec.
95.000% 0.710 sec.
99.000% 0.727 sec.
99.900% 0.733 sec.
99.990% 0.734 sec.
```
2018-08-28 11:03:52 +00:00
|
|
|
|
2018-08-30 21:13:34 +00:00
|
|
|
[&]()
|
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE2__
|
Enable sse2 for CSV parsing.
Testing data
```
select 'aaaaaaaa,bbbbbbbb,cccccccc,dddddddd,eeeeeeee,ffffffff,gggg,hhh' from numbers(3000000) into outfile '/tmp/test.csv'
```
Testing command
```
echo "select count() from file('/tmp/test.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
Before
```
QPS: 1.317, RPS: 3949749.687, MiB/s: 478.380, result RPS: 1.317, result MiB/s: 0.000.
0.000% 0.704 sec.
10.000% 0.712 sec.
20.000% 0.718 sec.
30.000% 0.726 sec.
40.000% 0.739 sec.
50.000% 0.754 sec.
60.000% 0.770 sec.
70.000% 0.788 sec.
80.000% 0.798 sec.
90.000% 0.815 sec.
95.000% 0.826 sec.
99.000% 0.850 sec.
99.900% 0.857 sec.
99.990% 0.858 sec.
```
After
```
QPS: 1.533, RPS: 4598308.336, MiB/s: 556.932, result RPS: 1.533, result MiB/s: 0.000.
0.000% 0.626 sec.
10.000% 0.635 sec.
20.000% 0.639 sec.
30.000% 0.642 sec.
40.000% 0.643 sec.
50.000% 0.645 sec.
60.000% 0.649 sec.
70.000% 0.652 sec.
80.000% 0.658 sec.
90.000% 0.682 sec.
95.000% 0.710 sec.
99.000% 0.727 sec.
99.900% 0.733 sec.
99.990% 0.734 sec.
```
2018-08-28 11:03:52 +00:00
|
|
|
auto rc = _mm_set1_epi8('\r');
|
|
|
|
auto nc = _mm_set1_epi8('\n');
|
|
|
|
auto dc = _mm_set1_epi8(delimiter);
|
|
|
|
for (; next_pos + 15 < buf.buffer().end(); next_pos += 16)
|
|
|
|
{
|
|
|
|
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(next_pos));
|
|
|
|
auto eq = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(bytes, rc), _mm_cmpeq_epi8(bytes, nc)), _mm_cmpeq_epi8(bytes, dc));
|
|
|
|
uint16_t bit_mask = _mm_movemask_epi8(eq);
|
|
|
|
if (bit_mask)
|
|
|
|
{
|
2022-07-31 14:34:05 +00:00
|
|
|
next_pos += std::countr_zero(bit_mask);
|
Enable sse2 for CSV parsing.
Testing data
```
select 'aaaaaaaa,bbbbbbbb,cccccccc,dddddddd,eeeeeeee,ffffffff,gggg,hhh' from numbers(3000000) into outfile '/tmp/test.csv'
```
Testing command
```
echo "select count() from file('/tmp/test.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
Before
```
QPS: 1.317, RPS: 3949749.687, MiB/s: 478.380, result RPS: 1.317, result MiB/s: 0.000.
0.000% 0.704 sec.
10.000% 0.712 sec.
20.000% 0.718 sec.
30.000% 0.726 sec.
40.000% 0.739 sec.
50.000% 0.754 sec.
60.000% 0.770 sec.
70.000% 0.788 sec.
80.000% 0.798 sec.
90.000% 0.815 sec.
95.000% 0.826 sec.
99.000% 0.850 sec.
99.900% 0.857 sec.
99.990% 0.858 sec.
```
After
```
QPS: 1.533, RPS: 4598308.336, MiB/s: 556.932, result RPS: 1.533, result MiB/s: 0.000.
0.000% 0.626 sec.
10.000% 0.635 sec.
20.000% 0.639 sec.
30.000% 0.642 sec.
40.000% 0.643 sec.
50.000% 0.645 sec.
60.000% 0.649 sec.
70.000% 0.652 sec.
80.000% 0.658 sec.
90.000% 0.682 sec.
95.000% 0.710 sec.
99.000% 0.727 sec.
99.900% 0.733 sec.
99.990% 0.734 sec.
```
2018-08-28 11:03:52 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2022-06-15 13:19:29 +00:00
|
|
|
#elif defined(__aarch64__) && defined(__ARM_NEON)
|
|
|
|
auto rc = vdupq_n_u8('\r');
|
|
|
|
auto nc = vdupq_n_u8('\n');
|
|
|
|
auto dc = vdupq_n_u8(delimiter);
|
|
|
|
for (; next_pos + 15 < buf.buffer().end(); next_pos += 16)
|
|
|
|
{
|
|
|
|
uint8x16_t bytes = vld1q_u8(reinterpret_cast<const uint8_t *>(next_pos));
|
|
|
|
auto eq = vorrq_u8(vorrq_u8(vceqq_u8(bytes, rc), vceqq_u8(bytes, nc)), vceqq_u8(bytes, dc));
|
2023-08-19 14:53:05 +00:00
|
|
|
uint64_t bit_mask = getNibbleMask(eq);
|
2022-06-15 13:19:29 +00:00
|
|
|
if (bit_mask)
|
|
|
|
{
|
2022-07-31 14:34:05 +00:00
|
|
|
next_pos += std::countr_zero(bit_mask) >> 2;
|
2022-06-15 13:19:29 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
Enable sse2 for CSV parsing.
Testing data
```
select 'aaaaaaaa,bbbbbbbb,cccccccc,dddddddd,eeeeeeee,ffffffff,gggg,hhh' from numbers(3000000) into outfile '/tmp/test.csv'
```
Testing command
```
echo "select count() from file('/tmp/test.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
Before
```
QPS: 1.317, RPS: 3949749.687, MiB/s: 478.380, result RPS: 1.317, result MiB/s: 0.000.
0.000% 0.704 sec.
10.000% 0.712 sec.
20.000% 0.718 sec.
30.000% 0.726 sec.
40.000% 0.739 sec.
50.000% 0.754 sec.
60.000% 0.770 sec.
70.000% 0.788 sec.
80.000% 0.798 sec.
90.000% 0.815 sec.
95.000% 0.826 sec.
99.000% 0.850 sec.
99.900% 0.857 sec.
99.990% 0.858 sec.
```
After
```
QPS: 1.533, RPS: 4598308.336, MiB/s: 556.932, result RPS: 1.533, result MiB/s: 0.000.
0.000% 0.626 sec.
10.000% 0.635 sec.
20.000% 0.639 sec.
30.000% 0.642 sec.
40.000% 0.643 sec.
50.000% 0.645 sec.
60.000% 0.649 sec.
70.000% 0.652 sec.
80.000% 0.658 sec.
90.000% 0.682 sec.
95.000% 0.710 sec.
99.000% 0.727 sec.
99.900% 0.733 sec.
99.990% 0.734 sec.
```
2018-08-28 11:03:52 +00:00
|
|
|
#endif
|
|
|
|
while (next_pos < buf.buffer().end()
|
|
|
|
&& *next_pos != delimiter && *next_pos != '\r' && *next_pos != '\n')
|
|
|
|
++next_pos;
|
|
|
|
}();
|
|
|
|
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
appendToStringOrVector(s, buf, next_pos);
|
2018-08-25 15:32:55 +00:00
|
|
|
buf.position() = next_pos;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-02-07 08:42:21 +00:00
|
|
|
if (!buf.hasPendingData())
|
|
|
|
continue;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2023-10-26 13:50:54 +00:00
|
|
|
/// Check for single '\r' not followed by '\n'
|
|
|
|
/// We should not stop in this case.
|
2023-11-07 01:45:46 +00:00
|
|
|
if (*buf.position() == '\r' && !settings.allow_cr_end_of_line)
|
2023-10-26 13:50:54 +00:00
|
|
|
{
|
|
|
|
++buf.position();
|
|
|
|
if (!buf.eof() && *buf.position() != '\n')
|
|
|
|
{
|
|
|
|
s.push_back('\r');
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-05-20 14:57:27 +00:00
|
|
|
if constexpr (WithResize<Vector>)
|
2022-05-13 13:51:28 +00:00
|
|
|
{
|
2023-05-25 07:51:32 +00:00
|
|
|
if (settings.trim_whitespaces) [[likely]]
|
|
|
|
{
|
|
|
|
/** CSV format can contain insignificant spaces and tabs.
|
|
|
|
* Usually the task of skipping them is for the calling code.
|
|
|
|
* But in this case, it will be difficult to do this, so remove the trailing whitespace by ourself.
|
|
|
|
*/
|
|
|
|
size_t size = s.size();
|
|
|
|
while (size > 0 && (s[size - 1] == ' ' || s[size - 1] == '\t'))
|
|
|
|
--size;
|
|
|
|
|
|
|
|
s.resize(size);
|
|
|
|
}
|
2022-05-13 13:51:28 +00:00
|
|
|
}
|
2016-02-07 08:42:21 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-04 21:00:50 +00:00
|
|
|
/// Replaces the contents of `s` with one CSV field read from `buf` according to `settings`.
void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings)
{
    /// The "Into" variant only appends, so drop the previous contents first.
    s.clear();
    readCSVStringInto<String>(s, buf, settings);
}
|
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
/// Replaces the contents of `s` with one CSV field read from `buf`.
/// Unlike readCSVString, the field is read with include_quotes == true,
/// so the enclosing quotes of a quoted field are kept in `s`.
void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings)
{
    constexpr bool include_quotes = true;

    s.clear();
    readCSVStringInto<String, include_quotes>(s, buf, settings);
}
|
|
|
|
|
2022-11-17 15:21:38 +00:00
|
|
|
/** Appends to `s` all data from `buf` up to the first occurrence of either
  * `first_delimiter` or `second_delimiter`.
  *
  * Throws BAD_ARGUMENTS if one of the delimiters is empty and INCORRECT_DATA
  * if the data ends before any of the delimiters is found.
  */
void readCSVWithTwoPossibleDelimitersImpl(String & s, PeekableReadBuffer & buf, const String & first_delimiter, const String & second_delimiter)
{
    /// Check that delimiters are not empty.
    if (first_delimiter.empty() || second_delimiter.empty())
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
                        "Cannot read CSV field with two possible delimiters, one "
                        "of delimiters '{}' and '{}' is empty", first_delimiter, second_delimiter);

    /// Read all data until first_delimiter or second_delimiter
    while (true)
    {
        /// The message is built from ordinary adjacent literals: a raw string literal here
        /// would put the literal-joining quotes, the line break and the indentation into the text.
        if (buf.eof())
            throw Exception(ErrorCodes::INCORRECT_DATA,
                            "Unexpected EOF while reading CSV string, expected one "
                            "of delimiters \"{}\" or \"{}\"", first_delimiter, second_delimiter);

        /// Skip to the next byte that could start one of the delimiters.
        char * next_pos = buf.position();
        while (next_pos != buf.buffer().end() && *next_pos != first_delimiter[0] && *next_pos != second_delimiter[0])
            ++next_pos;

        appendToStringOrVector(s, buf, next_pos);
        buf.position() = next_pos;
        if (!buf.hasPendingData())
            continue;

        /// NOTE(review): the `true` flag presumably makes the checkpoint roll back on scope exit - confirm.
        if (*buf.position() == first_delimiter[0])
        {
            PeekableReadBufferCheckpoint checkpoint(buf, true);
            if (checkString(first_delimiter, buf))
                return;
        }

        if (*buf.position() == second_delimiter[0])
        {
            PeekableReadBufferCheckpoint checkpoint(buf, true);
            if (checkString(second_delimiter, buf))
                return;
        }

        /// Only the first byte of a delimiter matched: it is ordinary data.
        s.push_back(*buf.position());
        ++buf.position();
    }
}
|
|
|
|
|
|
|
|
/// Reads a CSV string that is terminated by either `first_delimiter` or `second_delimiter` and returns it.
/// A value that starts with a quote allowed by `settings` is read with the regular CSV string reader instead.
String readCSVStringWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const FormatSettings::CSV & settings, const String & first_delimiter, const String & second_delimiter)
{
    String result;

    /// If value is quoted, use regular CSV reading since we need to read only data inside quotes.
    const bool is_quoted = !buf.eof()
        && ((settings.allow_single_quotes && *buf.position() == '\'')
            || (settings.allow_double_quotes && *buf.position() == '"'));

    if (is_quoted)
    {
        readCSVStringInto(result, buf, settings);
        return result;
    }

    readCSVWithTwoPossibleDelimitersImpl(result, buf, first_delimiter, second_delimiter);
    return result;
}
|
|
|
|
|
|
|
|
/// Reads a CSV field that is terminated by either `first_delimiter` or `second_delimiter` and returns it.
/// A value that starts with a quote allowed by `settings` is read with readCSVField instead.
String readCSVFieldWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const FormatSettings::CSV & settings, const String & first_delimiter, const String & second_delimiter)
{
    String result;

    /// If value is quoted, use regular CSV reading since we need to read only data inside quotes.
    const bool is_quoted = !buf.eof()
        && ((settings.allow_single_quotes && *buf.position() == '\'')
            || (settings.allow_double_quotes && *buf.position() == '"'));

    if (is_quoted)
    {
        readCSVField(result, buf, settings);
        return result;
    }

    readCSVWithTwoPossibleDelimitersImpl(result, buf, first_delimiter, second_delimiter);
    return result;
}
|
|
|
|
|
2018-07-04 21:00:50 +00:00
|
|
|
/// Explicit instantiation of readCSVStringInto for PaddedPODArray<UInt8> output.
template void readCSVStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
|
2022-05-13 13:51:28 +00:00
|
|
|
/// Explicit instantiation of readCSVStringInto for NullOutput.
template void readCSVStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
|
2016-02-16 16:39:39 +00:00
|
|
|
|
2016-02-07 08:42:21 +00:00
|
|
|
|
2017-08-09 01:34:01 +00:00
|
|
|
template <typename Vector, typename ReturnType>
|
|
|
|
ReturnType readJSONStringInto(Vector & s, ReadBuffer & buf)
|
2016-02-18 11:44:50 +00:00
|
|
|
{
|
2017-12-25 04:01:46 +00:00
|
|
|
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
|
2017-08-09 01:34:01 +00:00
|
|
|
|
2023-01-26 09:52:47 +00:00
|
|
|
auto error = [](FormatStringHelper<> message [[maybe_unused]], int code [[maybe_unused]])
|
2017-08-09 01:34:01 +00:00
|
|
|
{
|
2018-07-14 23:39:00 +00:00
|
|
|
if constexpr (throw_exception)
|
2023-11-21 13:13:42 +00:00
|
|
|
throw Exception(code, std::move(message));
|
2017-08-09 01:34:01 +00:00
|
|
|
return ReturnType(false);
|
|
|
|
};
|
|
|
|
|
2016-02-18 11:44:50 +00:00
|
|
|
if (buf.eof() || *buf.position() != '"')
|
2017-08-09 01:34:01 +00:00
|
|
|
return error("Cannot parse JSON string: expected opening quote", ErrorCodes::CANNOT_PARSE_QUOTED_STRING);
|
2016-02-18 11:44:50 +00:00
|
|
|
++buf.position();
|
|
|
|
|
|
|
|
while (!buf.eof())
|
|
|
|
{
|
2018-08-25 15:32:55 +00:00
|
|
|
char * next_pos = find_first_symbols<'\\', '"'>(buf.position(), buf.buffer().end());
|
2016-02-18 11:44:50 +00:00
|
|
|
|
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
|
|
|
appendToStringOrVector(s, buf, next_pos);
|
2018-08-25 15:32:55 +00:00
|
|
|
buf.position() = next_pos;
|
2016-02-18 11:44:50 +00:00
|
|
|
|
|
|
|
if (!buf.hasPendingData())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (*buf.position() == '"')
|
|
|
|
{
|
|
|
|
++buf.position();
|
2017-08-09 01:34:01 +00:00
|
|
|
return ReturnType(true);
|
2016-02-18 11:44:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (*buf.position() == '\\')
|
2017-08-09 01:34:01 +00:00
|
|
|
parseJSONEscapeSequence<Vector, ReturnType>(s, buf);
|
2016-02-18 11:44:50 +00:00
|
|
|
}
|
|
|
|
|
2017-08-09 01:34:01 +00:00
|
|
|
return error("Cannot parse JSON string: expected closing quote", ErrorCodes::CANNOT_PARSE_QUOTED_STRING);
|
2016-02-18 11:44:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Replaces the contents of `s` with a JSON string literal read from `buf`.
void readJSONString(String & s, ReadBuffer & buf)
{
    /// The "Into" variant only appends, so drop the previous contents first.
    s.clear();
    readJSONStringInto<String>(s, buf);
}
|
|
|
|
|
2017-08-09 01:34:01 +00:00
|
|
|
/// Explicit instantiation: PaddedPODArray<UInt8> output, void return type.
template void readJSONStringInto<PaddedPODArray<UInt8>, void>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
|
|
|
|
/// Explicit instantiation: PaddedPODArray<UInt8> output, bool return type.
template bool readJSONStringInto<PaddedPODArray<UInt8>, bool>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
|
2020-10-29 17:22:48 +00:00
|
|
|
/// Explicit instantiation: NullOutput, void return type.
template void readJSONStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
|
2018-09-14 12:15:32 +00:00
|
|
|
/// Explicit instantiation: String output, void return type.
template void readJSONStringInto<String>(String & s, ReadBuffer & buf);
|
2022-12-07 21:19:27 +00:00
|
|
|
/// Explicit instantiation: String output, bool return type.
template bool readJSONStringInto<String, bool>(String & s, ReadBuffer & buf);
|
2016-02-18 11:44:50 +00:00
|
|
|
|
2023-09-25 15:42:59 +00:00
|
|
|
template <typename Vector, typename ReturnType, char opening_bracket, char closing_bracket>
|
|
|
|
ReturnType readJSONObjectOrArrayPossiblyInvalid(Vector & s, ReadBuffer & buf)
|
2021-04-23 12:53:38 +00:00
|
|
|
{
|
|
|
|
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
|
|
|
|
|
2023-01-26 09:52:47 +00:00
|
|
|
auto error = [](FormatStringHelper<> message [[maybe_unused]], int code [[maybe_unused]])
|
2021-04-23 12:53:38 +00:00
|
|
|
{
|
|
|
|
if constexpr (throw_exception)
|
2023-11-21 13:13:42 +00:00
|
|
|
throw Exception(code, std::move(message));
|
2021-04-23 12:53:38 +00:00
|
|
|
return ReturnType(false);
|
|
|
|
};
|
|
|
|
|
2023-09-25 15:42:59 +00:00
|
|
|
if (buf.eof() || *buf.position() != opening_bracket)
|
|
|
|
return error("JSON object/array should start with corresponding opening bracket", ErrorCodes::INCORRECT_DATA);
|
2021-04-23 12:53:38 +00:00
|
|
|
|
|
|
|
s.push_back(*buf.position());
|
|
|
|
++buf.position();
|
2021-09-09 21:16:57 +00:00
|
|
|
|
2021-04-23 12:53:38 +00:00
|
|
|
Int64 balance = 1;
|
2021-09-09 21:16:57 +00:00
|
|
|
bool quotes = false;
|
|
|
|
|
2021-04-23 12:53:38 +00:00
|
|
|
while (!buf.eof())
|
|
|
|
{
|
2023-09-25 15:42:59 +00:00
|
|
|
char * next_pos = find_first_symbols<'\\', opening_bracket, closing_bracket, '"'>(buf.position(), buf.buffer().end());
|
2021-04-23 12:53:38 +00:00
|
|
|
appendToStringOrVector(s, buf, next_pos);
|
|
|
|
buf.position() = next_pos;
|
|
|
|
|
|
|
|
if (!buf.hasPendingData())
|
|
|
|
continue;
|
|
|
|
|
2021-09-09 21:16:57 +00:00
|
|
|
s.push_back(*buf.position());
|
|
|
|
|
2021-04-23 12:53:38 +00:00
|
|
|
if (*buf.position() == '\\')
|
|
|
|
{
|
2021-09-09 21:16:57 +00:00
|
|
|
++buf.position();
|
|
|
|
if (!buf.eof())
|
|
|
|
{
|
|
|
|
s.push_back(*buf.position());
|
|
|
|
++buf.position();
|
|
|
|
}
|
|
|
|
|
2021-04-23 12:53:38 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2021-09-09 21:16:57 +00:00
|
|
|
if (*buf.position() == '"')
|
|
|
|
quotes = !quotes;
|
2023-09-25 15:42:59 +00:00
|
|
|
else if (!quotes) // can be only opening_bracket or closing_bracket
|
|
|
|
balance += *buf.position() == opening_bracket ? 1 : -1;
|
2021-04-23 12:53:38 +00:00
|
|
|
|
|
|
|
++buf.position();
|
2021-05-03 00:56:19 +00:00
|
|
|
|
2021-04-23 12:53:38 +00:00
|
|
|
if (balance == 0)
|
|
|
|
return ReturnType(true);
|
2021-05-03 00:56:19 +00:00
|
|
|
|
2023-09-25 15:42:59 +00:00
|
|
|
if (balance < 0)
|
2021-05-03 00:56:19 +00:00
|
|
|
break;
|
2021-04-23 12:53:38 +00:00
|
|
|
}
|
|
|
|
|
2023-09-25 15:42:59 +00:00
|
|
|
return error("JSON object/array should have equal number of opening and closing brackets", ErrorCodes::INCORRECT_DATA);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename Vector, typename ReturnType>
|
|
|
|
ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
return readJSONObjectOrArrayPossiblyInvalid<Vector, ReturnType, '{', '}'>(s, buf);
|
2021-04-23 12:53:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
template void readJSONObjectPossiblyInvalid<String>(String & s, ReadBuffer & buf);
|
2023-09-26 16:41:35 +00:00
|
|
|
template void readJSONObjectPossiblyInvalid<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
|
2016-02-18 11:44:50 +00:00
|
|
|
|
2023-09-26 16:41:35 +00:00
|
|
|
template <typename Vector>
|
|
|
|
void readJSONArrayInto(Vector & s, ReadBuffer & buf)
|
2023-09-25 15:42:59 +00:00
|
|
|
{
|
2023-09-26 16:41:35 +00:00
|
|
|
readJSONObjectOrArrayPossiblyInvalid<Vector, void, '[', ']'>(s, buf);
|
2023-09-25 15:42:59 +00:00
|
|
|
}
|
|
|
|
|
2023-09-26 16:41:35 +00:00
|
|
|
template void readJSONArrayInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
|
|
|
|
|
2018-07-11 21:43:09 +00:00
|
|
|
template <typename ReturnType>
|
|
|
|
ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf)
|
2017-11-15 02:08:55 +00:00
|
|
|
{
|
2018-07-11 21:43:09 +00:00
|
|
|
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
|
|
|
|
|
2018-07-14 23:39:00 +00:00
|
|
|
auto error = []
|
|
|
|
{
|
|
|
|
if constexpr (throw_exception)
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATE, "Cannot parse date: value is too short");
|
2018-07-14 23:39:00 +00:00
|
|
|
return ReturnType(false);
|
|
|
|
};
|
2017-11-15 02:08:55 +00:00
|
|
|
|
2018-07-14 23:39:00 +00:00
|
|
|
auto append_digit = [&](auto & x)
|
2017-11-15 02:08:55 +00:00
|
|
|
{
|
2018-07-14 23:39:00 +00:00
|
|
|
if (!buf.eof() && isNumericASCII(*buf.position()))
|
|
|
|
{
|
|
|
|
x = x * 10 + (*buf.position() - '0');
|
|
|
|
++buf.position();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
return false;
|
|
|
|
};
|
|
|
|
|
|
|
|
UInt16 year = 0;
|
2021-10-29 21:23:03 +00:00
|
|
|
UInt8 month = 0;
|
|
|
|
UInt8 day = 0;
|
|
|
|
|
2018-07-14 23:39:00 +00:00
|
|
|
if (!append_digit(year)
|
2020-03-18 03:27:32 +00:00
|
|
|
|| !append_digit(year) // NOLINT
|
|
|
|
|| !append_digit(year) // NOLINT
|
|
|
|
|| !append_digit(year)) // NOLINT
|
2018-07-14 23:39:00 +00:00
|
|
|
return error();
|
|
|
|
|
2021-10-29 21:23:03 +00:00
|
|
|
if (buf.eof())
|
2018-07-14 23:39:00 +00:00
|
|
|
return error();
|
|
|
|
|
2021-10-29 21:23:03 +00:00
|
|
|
if (isNumericASCII(*buf.position()))
|
|
|
|
{
|
|
|
|
/// YYYYMMDD
|
|
|
|
if (!append_digit(month)
|
|
|
|
|| !append_digit(month) // NOLINT
|
|
|
|
|| !append_digit(day)
|
|
|
|
|| !append_digit(day)) // NOLINT
|
|
|
|
return error();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
++buf.position();
|
2018-07-14 23:39:00 +00:00
|
|
|
|
2021-10-29 21:23:03 +00:00
|
|
|
if (!append_digit(month))
|
|
|
|
return error();
|
|
|
|
append_digit(month);
|
2018-07-14 23:39:00 +00:00
|
|
|
|
2021-10-29 21:23:03 +00:00
|
|
|
if (!buf.eof() && !isNumericASCII(*buf.position()))
|
|
|
|
++buf.position();
|
|
|
|
else
|
|
|
|
return error();
|
|
|
|
|
|
|
|
if (!append_digit(day))
|
|
|
|
return error();
|
|
|
|
append_digit(day);
|
|
|
|
}
|
2017-11-15 02:08:55 +00:00
|
|
|
|
|
|
|
date = LocalDate(year, month, day);
|
2018-07-11 21:43:09 +00:00
|
|
|
return ReturnType(true);
|
2017-11-15 02:08:55 +00:00
|
|
|
}
|
|
|
|
|
2018-07-11 21:43:09 +00:00
|
|
|
template void readDateTextFallback<void>(LocalDate &, ReadBuffer &);
|
|
|
|
template bool readDateTextFallback<bool>(LocalDate &, ReadBuffer &);
|
|
|
|
|
2017-11-15 02:08:55 +00:00
|
|
|
|
2023-10-14 10:17:35 +00:00
|
|
|
template <typename ReturnType, bool dt64_mode>
|
2018-07-11 21:43:09 +00:00
|
|
|
ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut)
|
2015-04-01 02:55:52 +00:00
|
|
|
{
|
2018-07-11 21:43:09 +00:00
|
|
|
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
|
|
|
|
|
2020-10-16 21:31:29 +00:00
|
|
|
/// YYYY-MM-DD
|
|
|
|
static constexpr auto date_broken_down_length = 10;
|
2022-08-21 21:12:41 +00:00
|
|
|
/// hh:mm:ss
|
|
|
|
static constexpr auto time_broken_down_length = 8;
|
|
|
|
/// YYYY-MM-DD hh:mm:ss
|
|
|
|
static constexpr auto date_time_broken_down_length = date_broken_down_length + 1 + time_broken_down_length;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-03-23 02:12:31 +00:00
|
|
|
char s[date_time_broken_down_length];
|
2015-10-21 19:04:02 +00:00
|
|
|
char * s_pos = s;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2021-02-21 12:34:04 +00:00
|
|
|
/** Read characters, that could represent unix timestamp.
|
2023-10-14 10:17:35 +00:00
|
|
|
* Only unix timestamp of at least 5 characters is supported by default, exception is thrown for a shorter one
|
2023-10-20 20:25:58 +00:00
|
|
|
* (unless parsing a string like '1.23' or '-12': there is no ambiguity, it is a DT64 timestamp).
|
2021-02-21 12:34:04 +00:00
|
|
|
* Then look at 5th character. If it is a number - treat whole as unix timestamp.
|
|
|
|
* If it is not a number - then parse datetime in YYYY-MM-DD hh:mm:ss or YYYY-MM-DD format.
|
|
|
|
*/
|
|
|
|
|
2023-10-14 10:17:35 +00:00
|
|
|
int negative_multiplier = 1;
|
|
|
|
|
|
|
|
if (!buf.eof() && *buf.position() == '-')
|
|
|
|
{
|
|
|
|
if constexpr (dt64_mode)
|
|
|
|
{
|
|
|
|
negative_multiplier = -1;
|
|
|
|
++buf.position();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if constexpr (throw_exception)
|
2023-11-21 13:13:42 +00:00
|
|
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime");
|
2023-10-14 10:17:35 +00:00
|
|
|
else
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-02-21 12:34:04 +00:00
|
|
|
/// A piece similar to unix timestamp, maybe scaled to subsecond precision.
|
|
|
|
while (s_pos < s + date_time_broken_down_length && !buf.eof() && isNumericASCII(*buf.position()))
|
2015-04-01 02:55:52 +00:00
|
|
|
{
|
2015-10-21 19:04:02 +00:00
|
|
|
*s_pos = *buf.position();
|
|
|
|
++s_pos;
|
|
|
|
++buf.position();
|
2015-04-01 02:55:52 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-10-16 21:31:29 +00:00
|
|
|
/// 2015-01-01 01:02:03 or 2015-01-01
|
2023-10-14 10:17:35 +00:00
|
|
|
/// if negative, it is a timestamp with no ambiguity
|
|
|
|
if (negative_multiplier == 1 && s_pos == s + 4 && !buf.eof() && !isNumericASCII(*buf.position()))
|
2015-04-01 02:55:52 +00:00
|
|
|
{
|
2020-10-16 21:31:29 +00:00
|
|
|
const auto already_read_length = s_pos - s;
|
|
|
|
const size_t remaining_date_size = date_broken_down_length - already_read_length;
|
|
|
|
|
2022-08-21 21:12:41 +00:00
|
|
|
size_t size = buf.read(s_pos, remaining_date_size);
|
|
|
|
if (size != remaining_date_size)
|
2015-04-01 02:55:52 +00:00
|
|
|
{
|
2015-10-21 19:04:02 +00:00
|
|
|
s_pos[size] = 0;
|
2018-07-11 21:43:09 +00:00
|
|
|
|
|
|
|
if constexpr (throw_exception)
|
2023-11-21 13:13:42 +00:00
|
|
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime {}", s);
|
2018-07-11 21:43:09 +00:00
|
|
|
else
|
|
|
|
return false;
|
2015-04-01 02:55:52 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2015-04-01 02:55:52 +00:00
|
|
|
UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0');
|
|
|
|
UInt8 month = (s[5] - '0') * 10 + (s[6] - '0');
|
|
|
|
UInt8 day = (s[8] - '0') * 10 + (s[9] - '0');
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-10-16 21:31:29 +00:00
|
|
|
UInt8 hour = 0;
|
|
|
|
UInt8 minute = 0;
|
|
|
|
UInt8 second = 0;
|
|
|
|
|
2022-08-21 21:12:41 +00:00
|
|
|
if (!buf.eof() && (*buf.position() == ' ' || *buf.position() == 'T'))
|
2020-10-16 21:31:29 +00:00
|
|
|
{
|
2022-08-21 21:12:41 +00:00
|
|
|
++buf.position();
|
|
|
|
size = buf.read(s, time_broken_down_length);
|
|
|
|
|
|
|
|
if (size != time_broken_down_length)
|
|
|
|
{
|
|
|
|
s_pos[size] = 0;
|
|
|
|
|
|
|
|
if constexpr (throw_exception)
|
2023-11-21 13:13:42 +00:00
|
|
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse time component of DateTime {}", s);
|
2022-08-21 21:12:41 +00:00
|
|
|
else
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
hour = (s[0] - '0') * 10 + (s[1] - '0');
|
|
|
|
minute = (s[3] - '0') * 10 + (s[4] - '0');
|
|
|
|
second = (s[6] - '0') * 10 + (s[7] - '0');
|
2020-10-16 21:31:29 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2015-04-01 02:55:52 +00:00
|
|
|
if (unlikely(year == 0))
|
|
|
|
datetime = 0;
|
|
|
|
else
|
2017-01-22 08:33:16 +00:00
|
|
|
datetime = date_lut.makeDateTime(year, month, day, hour, minute, second);
|
2015-04-01 02:55:52 +00:00
|
|
|
}
|
|
|
|
else
|
2018-07-14 23:39:00 +00:00
|
|
|
{
|
2023-10-14 10:17:35 +00:00
|
|
|
datetime = 0;
|
|
|
|
bool too_short = s_pos - s <= 4;
|
|
|
|
|
|
|
|
if (!too_short || dt64_mode)
|
2018-07-14 23:39:00 +00:00
|
|
|
{
|
|
|
|
/// Not very efficient.
|
|
|
|
for (const char * digit_pos = s; digit_pos < s_pos; ++digit_pos)
|
|
|
|
datetime = datetime * 10 + *digit_pos - '0';
|
|
|
|
}
|
2023-10-14 10:17:35 +00:00
|
|
|
datetime *= negative_multiplier;
|
|
|
|
|
|
|
|
if (too_short && negative_multiplier != -1)
|
2018-07-14 23:39:00 +00:00
|
|
|
{
|
|
|
|
if constexpr (throw_exception)
|
2023-11-21 13:13:42 +00:00
|
|
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime");
|
2018-07-14 23:39:00 +00:00
|
|
|
else
|
|
|
|
return false;
|
|
|
|
}
|
2023-10-14 10:17:35 +00:00
|
|
|
|
2018-07-14 23:39:00 +00:00
|
|
|
}
|
2018-07-11 21:43:09 +00:00
|
|
|
|
|
|
|
return ReturnType(true);
|
2015-04-01 02:55:52 +00:00
|
|
|
}
|
|
|
|
|
2023-10-14 10:17:35 +00:00
|
|
|
template void readDateTimeTextFallback<void, false>(time_t &, ReadBuffer &, const DateLUTImpl &);
|
|
|
|
template void readDateTimeTextFallback<void, true>(time_t &, ReadBuffer &, const DateLUTImpl &);
|
|
|
|
template bool readDateTimeTextFallback<bool, false>(time_t &, ReadBuffer &, const DateLUTImpl &);
|
|
|
|
template bool readDateTimeTextFallback<bool, true>(time_t &, ReadBuffer &, const DateLUTImpl &);
|
2018-07-11 21:43:09 +00:00
|
|
|
|
2015-04-01 02:55:52 +00:00
|
|
|
|
2022-07-15 11:15:46 +00:00
|
|
|
/** Skips one JSON value at the current position of `buf`.
  *
  * Handled values: double-quoted string, number (an optional leading '+' is tolerated),
  * null, true, false, array and object (both recursively).
  * `name_of_field` is used only for error messages.
  *
  * Throws Exception(INCORRECT_DATA) on EOF or on an unexpected character;
  * the helpers it calls (assertString, readJSONStringInto) report their own errors.
  */
void skipJSONField(ReadBuffer & buf, StringRef name_of_field)
{
    if (buf.eof())
        throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString());
    else if (*buf.position() == '"') /// skip double-quoted string
    {
        NullOutput sink;
        readJSONStringInto(sink, buf);
    }
    else if (isNumericASCII(*buf.position()) || *buf.position() == '-' || *buf.position() == '+' || *buf.position() == '.') /// skip number
    {
        if (*buf.position() == '+')
            ++buf.position();

        double v;
        if (!tryReadFloatText(v, buf))
            throw Exception(ErrorCodes::INCORRECT_DATA, "Expected a number field for key '{}'", name_of_field.toString());
    }
    else if (*buf.position() == 'n') /// skip null
    {
        assertString("null", buf);
    }
    else if (*buf.position() == 't') /// skip true
    {
        assertString("true", buf);
    }
    else if (*buf.position() == 'f') /// skip false
    {
        assertString("false", buf);
    }
    else if (*buf.position() == '[') /// skip whole array
    {
        ++buf.position();
        skipWhitespaceIfAny(buf);

        if (!buf.eof() && *buf.position() == ']') /// skip empty array
        {
            ++buf.position();
            return;
        }

        while (true)
        {
            skipJSONField(buf, name_of_field);
            skipWhitespaceIfAny(buf);

            if (!buf.eof() && *buf.position() == ',')
            {
                ++buf.position();
                skipWhitespaceIfAny(buf);
            }
            else if (!buf.eof() && *buf.position() == ']')
            {
                ++buf.position();
                break;
            }
            else
                throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString());
        }
    }
    else if (*buf.position() == '{') /// skip whole object
    {
        ++buf.position();
        skipWhitespaceIfAny(buf);

        while (!buf.eof() && *buf.position() != '}')
        {
            // field name
            if (*buf.position() == '"')
            {
                NullOutput sink;
                readJSONStringInto(sink, buf);
            }
            else
                throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString());

            // ':'
            skipWhitespaceIfAny(buf);
            if (buf.eof() || !(*buf.position() == ':'))
                throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString());
            ++buf.position();
            skipWhitespaceIfAny(buf);

            skipJSONField(buf, name_of_field);
            skipWhitespaceIfAny(buf);

            // optional ','
            if (!buf.eof() && *buf.position() == ',')
            {
                ++buf.position();
                skipWhitespaceIfAny(buf);
            }
        }

        if (buf.eof())
            throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString());
        ++buf.position();
    }
    else
    {
        /// std::string(count, ch): one copy of the offending character.
        /// (With the arguments swapped, the character code would be taken as the length.)
        throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol '{}' for key '{}'",
            std::string(1, *buf.position()), name_of_field.toString());
    }
}
|
|
|
|
|
|
|
|
|
2021-01-19 21:42:31 +00:00
|
|
|
/** Deserializes an exception from `buf` and returns it as an Exception object.
  *
  * Fields are read in this order: code, name, message, stack trace, and an obsolete
  * "has nested" flag. The resulting text is composed as
  *   [additional_message ". "] [name ". " if it is not "DB::Exception"] message "." [" Stack trace:\n\n" stack_trace]
  * `remote_exception` is passed through to the created Exception.
  */
Exception readException(ReadBuffer & buf, const String & additional_message, bool remote_exception)
{
    int error_code = 0;
    String exception_name;
    String text;
    String trace;
    bool nested_flag = false; /// Obsolete, read only to consume it from the stream.

    readBinaryLittleEndian(error_code, buf);
    readBinary(exception_name, buf);
    readBinary(text, buf);
    readBinary(trace, buf);
    readBinary(nested_flag, buf);

    WriteBufferFromOwnString description;

    if (!additional_message.empty())
    {
        description << additional_message;
        description << ". ";
    }

    if (exception_name != "DB::Exception")
    {
        description << exception_name;
        description << ". ";
    }

    description << text;
    description << ".";

    if (!trace.empty())
    {
        description << " Stack trace:\n\n";
        description << trace;
    }

    return Exception::createDeprecated(description.str(), error_code, remote_exception);
}
|
|
|
|
|
|
|
|
/// Deserializes an exception from `buf` (see readException) and throws it.
void readAndThrowException(ReadBuffer & buf, const String & additional_message)
{
    auto exception = readException(buf, additional_message);
    exception.rethrow();
}
|
|
|
|
|
2016-08-16 21:23:53 +00:00
|
|
|
|
2021-02-19 12:51:26 +00:00
|
|
|
/// Advances `buf` to just past the next '\r', or to EOF if there is none.
void skipToCarriageReturnOrEOF(ReadBuffer & buf)
{
    while (!buf.eof())
    {
        buf.position() = find_first_symbols<'\r'>(buf.position(), buf.buffer().end());

        /// Not found in the current chunk; the eof() call in the loop condition moves on.
        if (!buf.hasPendingData())
            continue;

        if (*buf.position() != '\r')
            continue;

        ++buf.position();
        return;
    }
}
|
|
|
|
|
|
|
|
|
2017-01-27 04:29:47 +00:00
|
|
|
/// Advances `buf` to just past the next '\n', or to EOF if there is none.
void skipToNextLineOrEOF(ReadBuffer & buf)
{
    while (!buf.eof())
    {
        buf.position() = find_first_symbols<'\n'>(buf.position(), buf.buffer().end());

        /// Not found in the current chunk; the eof() call in the loop condition moves on.
        if (!buf.hasPendingData())
            continue;

        if (*buf.position() != '\n')
            continue;

        ++buf.position();
        return;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Advances `buf` to just past the next '\n' that is not preceded by a backslash,
/// or to EOF if there is none.
void skipToUnescapedNextLineOrEOF(ReadBuffer & buf)
{
    while (!buf.eof())
    {
        buf.position() = find_first_symbols<'\n', '\\'>(buf.position(), buf.buffer().end());

        /// Neither symbol found in the current chunk; the eof() call in the loop condition moves on.
        if (!buf.hasPendingData())
            continue;

        const char found = *buf.position();

        if (found == '\n')
        {
            ++buf.position();
            return;
        }

        if (found != '\\')
            continue;

        /// Step over the backslash itself.
        ++buf.position();
        if (buf.eof())
            return;

        /// Skip escaped character. We do not consider escape sequences with more than one character after backslash (\x01).
        /// It's ok for the purpose of this function, because we are interested only in \n and \\.
        ++buf.position();
    }
}
|
|
|
|
|
2022-11-10 20:15:14 +00:00
|
|
|
/// Advances `buf` to just past the next zero byte, or to EOF if there is none.
void skipNullTerminated(ReadBuffer & buf)
{
    while (!buf.eof())
    {
        buf.position() = find_first_symbols<'\0'>(buf.position(), buf.buffer().end());

        /// Not found in the current chunk; the eof() call in the loop condition moves on.
        if (!buf.hasPendingData())
            continue;

        if (*buf.position() != '\0')
            continue;

        ++buf.position();
        return;
    }
}
|
|
|
|
|
|
|
|
|
2021-10-16 08:28:10 +00:00
|
|
|
/** Appends the bytes [in.position(), current) to `memory` and moves in.position() to `current`.
  * `current` must point inside the working buffer of `in`, at or after its position.
  */
void saveUpToPosition(ReadBuffer & in, Memory<> & memory, char * current)
{
    assert(current >= in.position());
    assert(current <= in.buffer().end());

    const size_t saved_size = memory.size();
    const size_t chunk_size = current - in.position();
    const size_t total_size = saved_size + chunk_size;

    /// `memory` is empty and there is nothing to append:
    /// no need to resize or copy anything.
    if (total_size == 0)
        return;

    assert(in.position() + chunk_size <= in.buffer().end());

    memory.resize(total_size);
    char * destination = memory.data() + saved_size;
    memcpy(destination, in.position(), chunk_size);

    in.position() = current;
}
|
|
|
|
|
2021-10-16 08:28:10 +00:00
|
|
|
/** Makes sure there is data to read at `current`.
  *
  * If `current` is before the end of the working buffer of `in`, nothing is done and true is returned.
  * Otherwise the bytes up to `current` are saved to `memory` (see saveUpToPosition), the buffer is
  * asked for more data via eof(), and `current` is reset to the new position of `in`.
  * Returns false if `in` has no more data.
  */
bool loadAtPosition(ReadBuffer & in, Memory<> & memory, char * & current)
{
    assert(current <= in.buffer().end());

    const bool data_still_available = current < in.buffer().end();
    if (data_still_available)
        return true;

    saveUpToPosition(in, memory, current);

    const bool has_more_data = !in.eof();

    // A sanity check. Buffer position may be in the beginning of the buffer
    // (normal case), or have some offset from it (AIO).
    assert(in.position() >= in.buffer().begin());
    assert(in.position() <= in.buffer().end());

    current = in.position();
    return has_more_data;
}
|
|
|
|
|
2021-11-09 13:14:07 +00:00
|
|
|
/// Searches for delimiter in input stream and sets buffer position after delimiter (if found) or EOF (if not)
|
|
|
|
static void findAndSkipNextDelimiter(PeekableReadBuffer & buf, const String & delimiter)
|
|
|
|
{
|
|
|
|
if (delimiter.empty())
|
|
|
|
return;
|
|
|
|
|
|
|
|
while (!buf.eof())
|
|
|
|
{
|
|
|
|
void * pos = memchr(buf.position(), delimiter[0], buf.available());
|
|
|
|
if (!pos)
|
|
|
|
{
|
|
|
|
buf.position() += buf.available();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
buf.position() = static_cast<ReadBuffer::Position>(pos);
|
|
|
|
|
|
|
|
PeekableReadBufferCheckpoint checkpoint{buf};
|
|
|
|
if (checkString(delimiter, buf))
|
|
|
|
return;
|
|
|
|
|
|
|
|
buf.rollbackToCheckpoint();
|
|
|
|
++buf.position();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Skips input until the beginning of the next row, or until EOF.
  *
  * If `row_after_delimiter` is empty, just skips past the next `row_between_delimiter`.
  * Otherwise repeatedly skips past `row_after_delimiter` (and, if `skip_spaces` is set,
  * the whitespace after it) until EOF is reached or `row_between_delimiter` follows.
  */
void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delimiter, const String & row_between_delimiter, bool skip_spaces)
{
    if (row_after_delimiter.empty())
    {
        findAndSkipNextDelimiter(buf, row_between_delimiter);
        return;
    }

    do
    {
        findAndSkipNextDelimiter(buf, row_after_delimiter);

        if (skip_spaces)
            skipWhitespaceIfAny(buf);
    } while (!buf.eof() && !checkString(row_between_delimiter, buf));
}
|
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
// Use PeekableReadBuffer to copy field to string after parsing.
|
2022-05-13 13:51:28 +00:00
|
|
|
template <typename Vector, typename ParseFunc>
|
|
|
|
static void readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_func)
|
2021-12-15 11:30:57 +00:00
|
|
|
{
|
|
|
|
PeekableReadBuffer peekable_buf(buf);
|
|
|
|
peekable_buf.setCheckpoint();
|
|
|
|
parse_func(peekable_buf);
|
|
|
|
peekable_buf.makeContinuousMemoryFromCheckpointToPos();
|
|
|
|
auto * end = peekable_buf.position();
|
|
|
|
peekable_buf.rollbackToCheckpoint();
|
|
|
|
s.append(peekable_buf.position(), end);
|
|
|
|
peekable_buf.position() = end;
|
|
|
|
}
|
2021-12-03 13:25:35 +00:00
|
|
|
|
2022-12-07 21:19:27 +00:00
|
|
|
template <typename Vector>
|
|
|
|
static void readQuotedStringFieldInto(Vector & s, ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
assertChar('\'', buf);
|
|
|
|
s.push_back('\'');
|
|
|
|
while (!buf.eof())
|
|
|
|
{
|
|
|
|
char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end());
|
|
|
|
|
|
|
|
s.append(buf.position(), next_pos);
|
|
|
|
buf.position() = next_pos;
|
|
|
|
|
|
|
|
if (!buf.hasPendingData())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (*buf.position() == '\'')
|
|
|
|
break;
|
|
|
|
|
|
|
|
s.push_back(*buf.position());
|
|
|
|
if (*buf.position() == '\\')
|
|
|
|
{
|
|
|
|
++buf.position();
|
|
|
|
if (!buf.eof())
|
|
|
|
{
|
|
|
|
s.push_back(*buf.position());
|
|
|
|
++buf.position();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2022-12-14 21:17:00 +00:00
|
|
|
|
|
|
|
if (buf.eof())
|
|
|
|
return;
|
|
|
|
|
2022-12-07 21:19:27 +00:00
|
|
|
++buf.position();
|
|
|
|
s.push_back('\'');
|
|
|
|
}
|
|
|
|
|
2022-05-13 13:51:28 +00:00
|
|
|
template <char opening_bracket, char closing_bracket, typename Vector>
|
|
|
|
static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf)
|
2021-12-03 13:25:35 +00:00
|
|
|
{
|
|
|
|
assertChar(opening_bracket, buf);
|
|
|
|
s.push_back(opening_bracket);
|
|
|
|
|
|
|
|
size_t balance = 1;
|
|
|
|
|
|
|
|
while (!buf.eof() && balance)
|
|
|
|
{
|
|
|
|
char * next_pos = find_first_symbols<'\'', opening_bracket, closing_bracket>(buf.position(), buf.buffer().end());
|
|
|
|
appendToStringOrVector(s, buf, next_pos);
|
|
|
|
buf.position() = next_pos;
|
|
|
|
|
|
|
|
if (!buf.hasPendingData())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (*buf.position() == '\'')
|
|
|
|
{
|
2022-12-07 21:19:27 +00:00
|
|
|
readQuotedStringFieldInto(s, buf);
|
2021-12-03 13:25:35 +00:00
|
|
|
}
|
|
|
|
else if (*buf.position() == opening_bracket)
|
|
|
|
{
|
2022-12-07 21:19:27 +00:00
|
|
|
s.push_back(opening_bracket);
|
2021-12-03 13:25:35 +00:00
|
|
|
++balance;
|
|
|
|
++buf.position();
|
|
|
|
}
|
|
|
|
else if (*buf.position() == closing_bracket)
|
|
|
|
{
|
2022-12-07 21:19:27 +00:00
|
|
|
s.push_back(closing_bracket);
|
2021-12-03 13:25:35 +00:00
|
|
|
--balance;
|
|
|
|
++buf.position();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-05-13 13:51:28 +00:00
|
|
|
template <typename Vector>
|
|
|
|
void readQuotedFieldInto(Vector & s, ReadBuffer & buf)
|
2021-12-03 13:25:35 +00:00
|
|
|
{
|
|
|
|
if (buf.eof())
|
|
|
|
return;
|
|
|
|
|
|
|
|
/// Possible values in 'Quoted' field:
|
|
|
|
/// - Strings: '...'
|
|
|
|
/// - Arrays: [...]
|
|
|
|
/// - Tuples: (...)
|
|
|
|
/// - Maps: {...}
|
|
|
|
/// - NULL
|
2022-03-24 12:54:12 +00:00
|
|
|
/// - Bool: true/false
|
2021-12-03 13:25:35 +00:00
|
|
|
/// - Number: integer, float, decimal.
|
|
|
|
|
|
|
|
if (*buf.position() == '\'')
|
2022-12-07 21:19:27 +00:00
|
|
|
readQuotedStringFieldInto(s, buf);
|
2021-12-03 13:25:35 +00:00
|
|
|
else if (*buf.position() == '[')
|
2022-05-13 13:51:28 +00:00
|
|
|
readQuotedFieldInBracketsInto<'[', ']'>(s, buf);
|
2021-12-03 13:25:35 +00:00
|
|
|
else if (*buf.position() == '(')
|
2022-05-13 13:51:28 +00:00
|
|
|
readQuotedFieldInBracketsInto<'(', ')'>(s, buf);
|
2021-12-03 13:25:35 +00:00
|
|
|
else if (*buf.position() == '{')
|
2022-05-13 13:51:28 +00:00
|
|
|
readQuotedFieldInBracketsInto<'{', '}'>(s, buf);
|
2021-12-03 13:25:35 +00:00
|
|
|
else if (checkCharCaseInsensitive('n', buf))
|
|
|
|
{
|
|
|
|
/// NULL or NaN
|
|
|
|
if (checkCharCaseInsensitive('u', buf))
|
|
|
|
{
|
|
|
|
assertStringCaseInsensitive("ll", buf);
|
|
|
|
s.append("NULL");
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
assertStringCaseInsensitive("an", buf);
|
|
|
|
s.append("NaN");
|
|
|
|
}
|
|
|
|
}
|
2022-03-24 12:54:12 +00:00
|
|
|
else if (checkCharCaseInsensitive('t', buf))
|
|
|
|
{
|
|
|
|
assertStringCaseInsensitive("rue", buf);
|
|
|
|
s.append("true");
|
|
|
|
}
|
|
|
|
else if (checkCharCaseInsensitive('f', buf))
|
|
|
|
{
|
|
|
|
assertStringCaseInsensitive("alse", buf);
|
|
|
|
s.append("false");
|
|
|
|
}
|
2021-12-03 13:25:35 +00:00
|
|
|
else
|
|
|
|
{
|
|
|
|
/// It's an integer, float or decimal. They all can be parsed as float.
|
2021-12-15 11:30:57 +00:00
|
|
|
auto parse_func = [](ReadBuffer & in)
|
|
|
|
{
|
|
|
|
Float64 tmp;
|
|
|
|
readFloatText(tmp, in);
|
|
|
|
};
|
2022-05-13 13:51:28 +00:00
|
|
|
readParsedValueInto(s, buf, parse_func);
|
2021-12-03 13:25:35 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-05-13 15:08:02 +00:00
|
|
|
template void readQuotedFieldInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
|
|
|
|
|
2022-05-13 13:51:28 +00:00
|
|
|
/// Reads one field in 'Quoted' format from `buf` into `s`, replacing its previous content.
/// See readQuotedFieldInto for the accepted values.
void readQuotedField(String & s, ReadBuffer & buf)
{
    s.clear();
    readQuotedFieldInto(s, buf);
}
|
|
|
|
|
2022-05-06 16:48:48 +00:00
|
|
|
/// Reads the raw text of one JSON value from `buf` into `s`, replacing its previous content.
/// The value is delimited by running skipJSONField over the input; the consumed bytes are
/// then copied as is. "json_field" is the key name used in error messages of skipJSONField.
void readJSONField(String & s, ReadBuffer & buf)
{
    s.clear();
    readParsedValueInto(s, buf, [](ReadBuffer & in) { skipJSONField(in, "json_field"); });
}
|
2021-12-03 13:25:35 +00:00
|
|
|
|
2023-01-06 20:46:43 +00:00
|
|
|
/// Reads one TSV field from `buf` into `s`, replacing its previous content.
/// Delegates to readEscapedStringIntoImpl with its second template argument set to false.
void readTSVField(String & s, ReadBuffer & buf)
{
    s.clear();
    readEscapedStringIntoImpl<String, false>(s, buf);
}
|
|
|
|
|
2010-06-04 18:25:25 +00:00
|
|
|
}
|