2017-04-01 09:19:00 +00:00
# include <Core/Defines.h>
2020-06-23 03:14:16 +00:00
# include <Common/hex.h>
2017-04-01 09:19:00 +00:00
# include <Common/PODArray.h>
2018-01-15 19:07:47 +00:00
# include <Common/StringUtils/StringUtils.h>
2019-02-10 17:40:52 +00:00
# include <Common/memcpySmall.h>
2018-06-29 07:34:12 +00:00
# include <Formats/FormatSettings.h>
2017-04-16 05:40:17 +00:00
# include <IO/WriteBufferFromString.h>
2021-10-16 08:28:10 +00:00
# include <IO/BufferWithOwnMemory.h>
2018-01-13 04:45:13 +00:00
# include <IO/readFloatText.h>
2017-04-16 05:40:17 +00:00
# include <IO/Operators.h>
2021-10-02 07:13:14 +00:00
# include <base/find_symbols.h>
2022-05-08 17:01:47 +00:00
# include <cstdlib>
2022-07-31 14:34:05 +00:00
# include <bit>
2017-04-16 05:40:17 +00:00
2019-01-04 12:10:00 +00:00
# ifdef __SSE2__
Enable sse2 for CSV parsing.
Testing data
```
select 'aaaaaaaa,bbbbbbbb,cccccccc,dddddddd,eeeeeeee,ffffffff,gggg,hhh' from numbers(3000000) into outfile '/tmp/test.csv'
```
Testing command
```
echo "select count() from file('/tmp/test.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
Before
```
QPS: 1.317, RPS: 3949749.687, MiB/s: 478.380, result RPS: 1.317, result MiB/s: 0.000.
0.000% 0.704 sec.
10.000% 0.712 sec.
20.000% 0.718 sec.
30.000% 0.726 sec.
40.000% 0.739 sec.
50.000% 0.754 sec.
60.000% 0.770 sec.
70.000% 0.788 sec.
80.000% 0.798 sec.
90.000% 0.815 sec.
95.000% 0.826 sec.
99.000% 0.850 sec.
99.900% 0.857 sec.
99.990% 0.858 sec.
```
After
```
QPS: 1.533, RPS: 4598308.336, MiB/s: 556.932, result RPS: 1.533, result MiB/s: 0.000.
0.000% 0.626 sec.
10.000% 0.635 sec.
20.000% 0.639 sec.
30.000% 0.642 sec.
40.000% 0.643 sec.
50.000% 0.645 sec.
60.000% 0.649 sec.
70.000% 0.652 sec.
80.000% 0.658 sec.
90.000% 0.682 sec.
95.000% 0.710 sec.
99.000% 0.727 sec.
99.900% 0.733 sec.
99.990% 0.734 sec.
```
2018-08-28 11:03:52 +00:00
# include <emmintrin.h>
# endif
2022-06-15 13:19:29 +00:00
# if defined(__aarch64__) && defined(__ARM_NEON)
# include <arm_neon.h>
# ifdef HAS_RESERVED_IDENTIFIER
# pragma clang diagnostic ignored "-Wreserved-identifier"
# endif
# endif
2010-06-04 18:25:25 +00:00
namespace DB
{
2016-01-11 21:46:36 +00:00
/// Error codes used by the parsing helpers in this file.
/// They are only declared here (`extern`); the definitions are not visible in this file.
namespace ErrorCodes
{
2017-04-01 07:20:54 +00:00
extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED ;
extern const int CANNOT_PARSE_ESCAPE_SEQUENCE ;
extern const int CANNOT_PARSE_QUOTED_STRING ;
2018-07-11 21:43:09 +00:00
extern const int CANNOT_PARSE_DATETIME ;
extern const int CANNOT_PARSE_DATE ;
2017-04-01 07:20:54 +00:00
extern const int INCORRECT_DATA ;
2022-01-10 17:07:01 +00:00
extern const int ATTEMPT_TO_READ_AFTER_EOF ;
2022-11-17 19:06:56 +00:00
extern const int LOGICAL_ERROR ;
2022-12-08 20:46:22 +00:00
extern const int BAD_ARGUMENTS ;
2016-01-11 21:46:36 +00:00
}
2022-12-30 14:16:20 +00:00
template < size_t num_bytes , typename IteratorSrc , typename IteratorDst >
inline void parseHex ( IteratorSrc src , IteratorDst dst )
2020-06-23 03:14:16 +00:00
{
size_t src_pos = 0 ;
size_t dst_pos = 0 ;
for ( ; dst_pos < num_bytes ; + + dst_pos , src_pos + = 2 )
dst [ dst_pos ] = unhex2 ( reinterpret_cast < const char * > ( & src [ src_pos ] ) ) ;
}
/// Parses the textual UUID in `src36` into 16 bytes at `dst16`.
/// Five hex groups of 4, 2, 2, 2 and 6 bytes are read from source offsets 0, 9, 14, 19 and 24;
/// the characters between the groups are skipped without being checked.
void parseUUID(const UInt8 * src36, UInt8 * dst16)
{
    /// If string is not like UUID - implementation specific behaviour.

    parseHex<4>(src36, dst16);
    parseHex<2>(src36 + 9, dst16 + 4);
    parseHex<2>(src36 + 14, dst16 + 6);
    parseHex<2>(src36 + 19, dst16 + 8);
    parseHex<6>(src36 + 24, dst16 + 10);
}
void parseUUIDWithoutSeparator ( const UInt8 * src36 , UInt8 * dst16 )
{
/// If string is not like UUID - implementation specific behaviour.
2022-12-30 14:16:20 +00:00
parseHex < 16 > ( & src36 [ 0 ] , & dst16 [ 0 ] ) ;
2020-06-23 03:14:16 +00:00
}
/** Function used when byte ordering is important when parsing uuid
* ex : When we create an UUID type
*/
void parseUUID ( const UInt8 * src36 , std : : reverse_iterator < UInt8 * > dst16 )
{
/// If string is not like UUID - implementation specific behaviour.
/// FIXME This code looks like trash.
2022-12-30 14:16:20 +00:00
parseHex < 4 > ( & src36 [ 0 ] , dst16 + 8 ) ;
parseHex < 2 > ( & src36 [ 9 ] , dst16 + 12 ) ;
parseHex < 2 > ( & src36 [ 14 ] , dst16 + 14 ) ;
parseHex < 2 > ( & src36 [ 19 ] , dst16 ) ;
parseHex < 6 > ( & src36 [ 24 ] , dst16 + 2 ) ;
2020-06-23 03:14:16 +00:00
}
/** Function used when byte ordering is important when parsing uuid
* ex : When we create an UUID type
*/
void parseUUIDWithoutSeparator ( const UInt8 * src36 , std : : reverse_iterator < UInt8 * > dst16 )
{
/// If string is not like UUID - implementation specific behaviour.
2022-12-30 14:16:20 +00:00
parseHex < 8 > ( & src36 [ 0 ] , dst16 + 8 ) ;
parseHex < 8 > ( & src36 [ 16 ] , dst16 ) ;
2020-06-23 03:14:16 +00:00
}
2018-10-09 21:32:15 +00:00
/// Builds the "Cannot parse input" diagnostic for an expectation `s` that was not met
/// and throws it as ParsingException with code CANNOT_PARSE_INPUT_ASSERTION_FAILED.
/// If data is still available, up to SHOW_CHARS_ON_SYNTAX_ERROR bytes at the cursor are quoted in the message.
void NO_INLINE throwAtAssertionFailed(const char * s, ReadBuffer & buf)
{
    WriteBufferFromOwnString message;
    message << "Cannot parse input: expected " << quote << s;

    if (!buf.eof())
    {
        /// Never show more than what is left in the current buffer.
        const auto bytes_available = buf.buffer().end() - buf.position();
        const auto preview_size = std::min(SHOW_CHARS_ON_SYNTAX_ERROR, bytes_available);
        message << " before: " << quote << String(buf.position(), preview_size);
    }
    else
    {
        message << " at end of stream.";
    }

    throw ParsingException(message.str(), ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED);
}
2015-10-05 14:20:56 +00:00
bool checkString ( const char * s , ReadBuffer & buf )
2010-06-04 18:25:25 +00:00
{
2017-04-01 07:20:54 +00:00
for ( ; * s ; + + s )
{
if ( buf . eof ( ) | | * buf . position ( ) ! = * s )
return false ;
+ + buf . position ( ) ;
}
return true ;
2015-10-05 14:20:56 +00:00
}
2016-08-16 21:23:53 +00:00
2016-12-30 05:13:14 +00:00
bool checkStringCaseInsensitive ( const char * s , ReadBuffer & buf )
2016-08-16 21:23:53 +00:00
{
2017-04-01 07:20:54 +00:00
for ( ; * s ; + + s )
{
if ( buf . eof ( ) )
return false ;
2016-08-16 21:23:53 +00:00
2017-04-01 07:20:54 +00:00
char c = * buf . position ( ) ;
if ( ! equalsCaseInsensitive ( * s , c ) )
return false ;
2016-08-16 21:23:53 +00:00
2017-04-01 07:20:54 +00:00
+ + buf . position ( ) ;
}
return true ;
2016-08-16 21:23:53 +00:00
}
2015-10-05 14:20:56 +00:00
void assertString ( const char * s , ReadBuffer & buf )
{
2017-04-01 07:20:54 +00:00
if ( ! checkString ( s , buf ) )
throwAtAssertionFailed ( s , buf ) ;
2010-06-04 18:25:25 +00:00
}
2015-06-03 15:32:06 +00:00
2014-03-27 11:29:40 +00:00
/// Requires that `buf` has no more data; otherwise reports that "eof" was expected.
void assertEOF(ReadBuffer & buf)
{
    if (buf.eof())
        return;

    throwAtAssertionFailed("eof", buf);
}
2021-12-30 13:15:28 +00:00
/// Requires that `buf` still has data; throws ATTEMPT_TO_READ_AFTER_EOF otherwise.
void assertNotEOF(ReadBuffer & buf)
{
    const bool exhausted = buf.eof();
    if (exhausted)
        throw Exception("Attempt to read after EOF", ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF);
}
2016-02-16 16:39:39 +00:00
2016-12-30 05:13:14 +00:00
void assertStringCaseInsensitive ( const char * s , ReadBuffer & buf )
{
2017-04-01 07:20:54 +00:00
if ( ! checkStringCaseInsensitive ( s , buf ) )
throwAtAssertionFailed ( s , buf ) ;
2016-12-30 05:13:14 +00:00
}
bool checkStringByFirstCharacterAndAssertTheRest ( const char * s , ReadBuffer & buf )
{
2017-04-01 07:20:54 +00:00
if ( buf . eof ( ) | | * buf . position ( ) ! = * s )
return false ;
2016-12-30 05:13:14 +00:00
2017-04-01 07:20:54 +00:00
assertString ( s , buf ) ;
return true ;
2016-12-30 05:13:14 +00:00
}
bool checkStringByFirstCharacterAndAssertTheRestCaseInsensitive ( const char * s , ReadBuffer & buf )
{
2017-04-01 07:20:54 +00:00
if ( buf . eof ( ) )
return false ;
2016-12-30 05:13:14 +00:00
2017-04-01 07:20:54 +00:00
char c = * buf . position ( ) ;
if ( ! equalsCaseInsensitive ( * s , c ) )
return false ;
2016-12-30 05:13:14 +00:00
2017-04-01 07:20:54 +00:00
assertStringCaseInsensitive ( s , buf ) ;
return true ;
2016-12-30 05:13:14 +00:00
}
2016-02-16 16:39:39 +00:00
template < typename T >
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
static void appendToStringOrVector ( T & s , ReadBuffer & rb , const char * end )
2016-02-16 16:39:39 +00:00
{
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
s . append ( rb . position ( ) , end - rb . position ( ) ) ;
2016-02-16 16:39:39 +00:00
}
template < >
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
inline void appendToStringOrVector ( PaddedPODArray < UInt8 > & s , ReadBuffer & rb , const char * end )
2016-02-16 16:39:39 +00:00
{
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
if ( rb . isPadded ( ) )
s . insertSmallAllowReadWriteOverflow15 ( rb . position ( ) , end ) ;
else
s . insert ( rb . position ( ) , end ) ;
2016-02-16 16:39:39 +00:00
}
2019-07-01 05:58:31 +00:00
/// Specialization for PODArray<char>: appends [rb.position(), end) to `s` with the regular insert.
template <>
inline void appendToStringOrVector(PODArray<char> & s, ReadBuffer & rb, const char * end)
{
    auto * range_begin = rb.position();
    s.insert(range_begin, end);
}
2020-12-02 21:05:51 +00:00
template < char . . . chars , typename Vector >
void readStringUntilCharsInto ( Vector & s , ReadBuffer & buf )
2010-06-04 18:25:25 +00:00
{
2017-04-01 07:20:54 +00:00
while ( ! buf . eof ( ) )
{
2020-12-02 21:05:51 +00:00
char * next_pos = find_first_symbols < chars . . . > ( buf . position ( ) , buf . buffer ( ) . end ( ) ) ;
2010-06-04 18:25:25 +00:00
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
appendToStringOrVector ( s , buf , next_pos ) ;
2018-08-25 15:32:55 +00:00
buf . position ( ) = next_pos ;
2010-06-04 18:25:25 +00:00
2017-04-01 07:20:54 +00:00
if ( buf . hasPendingData ( ) )
return ;
}
2010-06-04 18:25:25 +00:00
}
2019-07-01 05:58:31 +00:00
template < typename Vector >
2020-12-02 21:05:51 +00:00
void readStringInto ( Vector & s , ReadBuffer & buf )
2019-07-01 05:58:31 +00:00
{
2020-12-02 21:05:51 +00:00
readStringUntilCharsInto < ' \t ' , ' \n ' > ( s , buf ) ;
}
2019-07-01 05:58:31 +00:00
2020-12-02 21:05:51 +00:00
/// Appends data from `buf` to `s` up to (not including) the first space character, or until EOF.
template <typename Vector>
void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf)
{
    return readStringUntilCharsInto<' '>(s, buf);
}
2019-07-01 05:58:31 +00:00
2022-02-03 21:07:31 +00:00
/// Appends data from `buf` to `s` up to (not including) the first newline, or until EOF.
template <typename Vector>
void readStringUntilNewlineInto(Vector & s, ReadBuffer & buf)
{
    return readStringUntilCharsInto<'\n'>(s, buf);
}
2022-02-03 22:44:43 +00:00
/// Explicit instantiations of readStringUntilNewlineInto for PaddedPODArray<UInt8> and String outputs.
template void readStringUntilNewlineInto < PaddedPODArray < UInt8 > > ( PaddedPODArray < UInt8 > & s , ReadBuffer & buf ) ;
2022-02-03 21:25:36 +00:00
template void readStringUntilNewlineInto < String > ( String & s , ReadBuffer & buf ) ;
2020-12-02 21:05:51 +00:00
template < typename Vector >
void readNullTerminated ( Vector & s , ReadBuffer & buf )
{
readStringUntilCharsInto < ' \0 ' > ( s , buf ) ;
2019-07-01 05:58:31 +00:00
buf . ignore ( ) ;
}
2020-12-02 21:05:51 +00:00
/// Replaces the contents of `s` with the data read from `buf` up to (not including) the first space, or until EOF.
void readStringUntilWhitespace(String & s, ReadBuffer & buf)
{
    /// Previous contents are discarded: the *Into helper only appends.
    s.clear();
    readStringUntilWhitespaceInto(s, buf);
}
2019-07-01 05:58:31 +00:00
/// Explicit instantiations of readNullTerminated for PODArray<char> and String outputs.
template void readNullTerminated < PODArray < char > > ( PODArray < char > & s , ReadBuffer & buf ) ;
template void readNullTerminated < String > ( String & s , ReadBuffer & buf ) ;
2016-02-16 16:39:39 +00:00
/// Replaces the contents of `s` with the data read from `buf` up to (not including) the first tab or newline, or until EOF.
void readString(String & s, ReadBuffer & buf)
{
    /// Previous contents are discarded: readStringInto only appends.
    s.clear();
    readStringInto(s, buf);
}
2016-04-15 00:33:21 +00:00
/// Explicit instantiations of readStringInto for PaddedPODArray<UInt8>, String and NullOutput outputs.
template void readStringInto < PaddedPODArray < UInt8 > > ( PaddedPODArray < UInt8 > & s , ReadBuffer & buf ) ;
2021-05-07 02:19:54 +00:00
template void readStringInto < String > ( String & s , ReadBuffer & buf ) ;
2022-05-13 15:08:02 +00:00
template void readStringInto < NullOutput > ( NullOutput & s , ReadBuffer & buf ) ;
2016-02-16 16:39:39 +00:00
template < typename Vector >
void readStringUntilEOFInto ( Vector & s , ReadBuffer & buf )
{
2017-04-01 07:20:54 +00:00
while ( ! buf . eof ( ) )
{
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
appendToStringOrVector ( s , buf , buf . buffer ( ) . end ( ) ) ;
2018-08-25 15:32:55 +00:00
buf . position ( ) = buf . buffer ( ) . end ( ) ;
2017-04-01 07:20:54 +00:00
}
2015-09-08 14:24:25 +00:00
}
2013-01-05 10:07:01 +00:00
2019-02-27 16:41:51 +00:00
2016-02-16 16:39:39 +00:00
/// Replaces the contents of `s` with everything that remains in `buf`.
void readStringUntilEOF(String & s, ReadBuffer & buf)
{
    /// Previous contents are discarded: readStringUntilEOFInto only appends.
    s.clear();
    readStringUntilEOFInto(s, buf);
}
2019-02-27 16:41:51 +00:00
/// Appends data from `buf` to `s` until a newline or EOF, decoding backslash escape sequences on the way.
/// The newline is not consumed.
template <typename Vector>
void readEscapedStringUntilEOLInto(Vector & s, ReadBuffer & buf)
{
    while (!buf.eof())
    {
        char * const stop = find_first_symbols<'\n', '\\'>(buf.position(), buf.buffer().end());

        appendToStringOrVector(s, buf, stop);
        buf.position() = stop;

        /// Nothing pending: the current buffer was fully copied, go on with the next one.
        if (buf.hasPendingData())
        {
            const char found = *buf.position();

            if (found == '\n')
                return;

            if (found == '\\')
                parseComplexEscapeSequence(s, buf);
        }
    }
}
/// Replaces the contents of `s` with the escaped string read from `buf` up to (not including) a newline, or until EOF.
void readEscapedStringUntilEOL(String & s, ReadBuffer & buf)
{
    /// Previous contents are discarded: the *Into helper only appends.
    s.clear();
    readEscapedStringUntilEOLInto(s, buf);
}
2016-04-15 00:33:21 +00:00
/// Explicit instantiation of readStringUntilEOFInto for the PaddedPODArray<UInt8> output.
template void readStringUntilEOFInto < PaddedPODArray < UInt8 > > ( PaddedPODArray < UInt8 > & s , ReadBuffer & buf ) ;
2016-02-16 16:39:39 +00:00
2017-03-25 20:12:56 +00:00
/** Parse the escape sequence, which can be simple (one character after backslash) or more complex (multiple characters).
* It is assumed that the cursor is located on the ` \ ` symbol
2015-11-25 03:11:17 +00:00
*/
2022-12-07 21:19:27 +00:00
template < typename Vector , typename ReturnType = void >
static ReturnType parseComplexEscapeSequence ( Vector & s , ReadBuffer & buf )
2015-11-25 03:11:17 +00:00
{
2017-04-01 07:20:54 +00:00
+ + buf . position ( ) ;
if ( buf . eof ( ) )
2022-12-07 21:19:27 +00:00
{
if constexpr ( std : : is_same_v < ReturnType , void > )
throw Exception ( " Cannot parse escape sequence " , ErrorCodes : : CANNOT_PARSE_ESCAPE_SEQUENCE ) ;
else
2022-12-12 22:00:45 +00:00
return ReturnType ( false ) ;
2022-12-07 21:19:27 +00:00
}
2017-04-01 07:20:54 +00:00
2020-05-26 14:24:20 +00:00
char char_after_backslash = * buf . position ( ) ;
if ( char_after_backslash = = ' x ' )
2017-04-01 07:20:54 +00:00
{
+ + buf . position ( ) ;
2017-08-09 01:34:01 +00:00
/// escape sequence of the form \xAA
char hex_code [ 2 ] ;
readPODBinary ( hex_code , buf ) ;
s . push_back ( unhex2 ( hex_code ) ) ;
2017-04-01 07:20:54 +00:00
}
2020-05-26 14:24:20 +00:00
else if ( char_after_backslash = = ' N ' )
2017-04-01 07:20:54 +00:00
{
/// Support for NULLs: \N sequence must be parsed as empty string.
+ + buf . position ( ) ;
}
else
{
/// The usual escape sequence of a single character.
2020-05-26 14:24:20 +00:00
char decoded_char = parseEscapeSequence ( char_after_backslash ) ;
/// For convenience using LIKE and regular expressions,
/// we leave backslash when user write something like 'Hello 100\%':
/// it is parsed like Hello 100\% instead of Hello 100%
2020-05-26 21:48:29 +00:00
if ( decoded_char ! = ' \\ '
& & decoded_char ! = ' \' '
& & decoded_char ! = ' " '
& & decoded_char ! = ' ` ' /// MySQL style identifiers
& & decoded_char ! = ' / ' /// JavaScript in HTML
2022-04-15 22:20:47 +00:00
& & decoded_char ! = ' = ' /// TSKV format invented somewhere
2020-05-26 21:48:29 +00:00
& & ! isControlASCII ( decoded_char ) )
{
2020-05-26 14:24:20 +00:00
s . push_back ( ' \\ ' ) ;
2020-05-26 21:48:29 +00:00
}
2020-05-26 14:24:20 +00:00
s . push_back ( decoded_char ) ;
2017-04-01 07:20:54 +00:00
+ + buf . position ( ) ;
}
2022-12-07 21:19:27 +00:00
return ReturnType ( true ) ;
2015-11-25 03:11:17 +00:00
}
2017-08-09 01:34:01 +00:00
template < typename Vector , typename ReturnType >
static ReturnType parseJSONEscapeSequence ( Vector & s , ReadBuffer & buf )
2016-02-18 11:44:50 +00:00
{
2017-12-25 04:01:46 +00:00
static constexpr bool throw_exception = std : : is_same_v < ReturnType , void > ;
2017-08-09 01:34:01 +00:00
2018-07-16 06:09:27 +00:00
auto error = [ ] ( const char * message [[maybe_unused]], int code [[maybe_unused]] )
2017-08-09 01:34:01 +00:00
{
2018-07-14 23:39:00 +00:00
if constexpr ( throw_exception )
2017-08-09 01:34:01 +00:00
throw Exception ( message , code ) ;
return ReturnType ( false ) ;
} ;
2017-04-01 07:20:54 +00:00
+ + buf . position ( ) ;
2021-07-14 11:59:06 +00:00
2017-04-01 07:20:54 +00:00
if ( buf . eof ( ) )
2017-08-09 01:34:01 +00:00
return error ( " Cannot parse escape sequence " , ErrorCodes : : CANNOT_PARSE_ESCAPE_SEQUENCE ) ;
2017-04-01 07:20:54 +00:00
2021-07-14 11:59:06 +00:00
assert ( buf . hasPendingData ( ) ) ;
2017-08-09 01:34:01 +00:00
switch ( * buf . position ( ) )
2017-04-01 07:20:54 +00:00
{
case ' " ' :
s . push_back ( ' " ' ) ;
break ;
case ' \\ ' :
s . push_back ( ' \\ ' ) ;
break ;
case ' / ' :
s . push_back ( ' / ' ) ;
break ;
case ' b ' :
s . push_back ( ' \b ' ) ;
break ;
case ' f ' :
s . push_back ( ' \f ' ) ;
break ;
case ' n ' :
s . push_back ( ' \n ' ) ;
break ;
case ' r ' :
s . push_back ( ' \r ' ) ;
break ;
case ' t ' :
s . push_back ( ' \t ' ) ;
break ;
case ' u ' :
{
+ + buf . position ( ) ;
char hex_code [ 4 ] ;
2017-08-09 01:34:01 +00:00
if ( 4 ! = buf . read ( hex_code , 4 ) )
return error ( " Cannot parse escape sequence: less than four bytes after \\ u " , ErrorCodes : : CANNOT_PARSE_ESCAPE_SEQUENCE ) ;
2017-04-01 07:20:54 +00:00
/// \u0000 - special case
2017-08-09 01:34:01 +00:00
if ( 0 = = memcmp ( hex_code , " 0000 " , 4 ) )
2017-04-01 07:20:54 +00:00
{
s . push_back ( 0 ) ;
2017-08-09 01:34:01 +00:00
return ReturnType ( true ) ;
2017-04-01 07:20:54 +00:00
}
2017-08-09 01:34:01 +00:00
UInt16 code_point = unhex4 ( hex_code ) ;
2017-04-01 07:20:54 +00:00
if ( code_point < = 0x7F )
{
s . push_back ( code_point ) ;
}
2017-08-09 01:34:01 +00:00
else if ( code_point < = 0x07FF )
2017-04-01 07:20:54 +00:00
{
s . push_back ( ( ( code_point > > 6 ) & 0x1F ) | 0xC0 ) ;
s . push_back ( ( code_point & 0x3F ) | 0x80 ) ;
}
else
{
/// Surrogate pair.
if ( code_point > = 0xD800 & & code_point < = 0xDBFF )
{
2017-08-09 01:34:01 +00:00
if ( ! checkString ( " \\ u " , buf ) )
return error ( " Cannot parse escape sequence: missing second part of surrogate pair " , ErrorCodes : : CANNOT_PARSE_ESCAPE_SEQUENCE ) ;
2017-04-01 07:20:54 +00:00
char second_hex_code [ 4 ] ;
2017-08-09 01:34:01 +00:00
if ( 4 ! = buf . read ( second_hex_code , 4 ) )
return error ( " Cannot parse escape sequence: less than four bytes after \\ u of second part of surrogate pair " ,
ErrorCodes : : CANNOT_PARSE_ESCAPE_SEQUENCE ) ;
2017-04-01 07:20:54 +00:00
2017-08-09 01:34:01 +00:00
UInt16 second_code_point = unhex4 ( second_hex_code ) ;
2017-04-01 07:20:54 +00:00
if ( second_code_point > = 0xDC00 & & second_code_point < = 0xDFFF )
{
UInt32 full_code_point = 0x10000 + ( code_point - 0xD800 ) * 1024 + ( second_code_point - 0xDC00 ) ;
s . push_back ( ( ( full_code_point > > 18 ) & 0x07 ) | 0xF0 ) ;
s . push_back ( ( ( full_code_point > > 12 ) & 0x3F ) | 0x80 ) ;
s . push_back ( ( ( full_code_point > > 6 ) & 0x3F ) | 0x80 ) ;
s . push_back ( ( full_code_point & 0x3F ) | 0x80 ) ;
}
else
2017-08-09 01:34:01 +00:00
return error ( " Incorrect surrogate pair of unicode escape sequences in JSON " , ErrorCodes : : CANNOT_PARSE_ESCAPE_SEQUENCE ) ;
2017-04-01 07:20:54 +00:00
}
else
{
s . push_back ( ( ( code_point > > 12 ) & 0x0F ) | 0xE0 ) ;
s . push_back ( ( ( code_point > > 6 ) & 0x3F ) | 0x80 ) ;
s . push_back ( ( code_point & 0x3F ) | 0x80 ) ;
}
}
2017-08-09 01:34:01 +00:00
return ReturnType ( true ) ;
2017-04-01 07:20:54 +00:00
}
default :
s . push_back ( * buf . position ( ) ) ;
break ;
}
+ + buf . position ( ) ;
2017-08-09 01:34:01 +00:00
return ReturnType ( true ) ;
2016-02-18 11:44:50 +00:00
}
2016-02-16 16:39:39 +00:00
template < typename Vector >
void readEscapedStringInto ( Vector & s , ReadBuffer & buf )
2010-06-04 18:25:25 +00:00
{
2017-04-01 07:20:54 +00:00
while ( ! buf . eof ( ) )
{
2018-08-25 15:32:55 +00:00
char * next_pos = find_first_symbols < ' \t ' , ' \n ' , ' \\ ' > ( buf . position ( ) , buf . buffer ( ) . end ( ) ) ;
2010-06-04 18:25:25 +00:00
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
appendToStringOrVector ( s , buf , next_pos ) ;
2018-08-25 15:32:55 +00:00
buf . position ( ) = next_pos ;
2010-06-04 18:25:25 +00:00
2017-04-01 07:20:54 +00:00
if ( ! buf . hasPendingData ( ) )
continue ;
2011-12-26 02:17:33 +00:00
2017-04-01 07:20:54 +00:00
if ( * buf . position ( ) = = ' \t ' | | * buf . position ( ) = = ' \n ' )
return ;
2010-06-04 18:25:25 +00:00
2017-04-01 07:20:54 +00:00
if ( * buf . position ( ) = = ' \\ ' )
parseComplexEscapeSequence ( s , buf ) ;
}
2010-06-04 18:25:25 +00:00
}
2016-02-16 16:39:39 +00:00
/// Replaces the contents of `s` with the escaped string read from `buf` up to (not including) a tab or newline, or until EOF.
void readEscapedString(String & s, ReadBuffer & buf)
{
    /// Previous contents are discarded: readEscapedStringInto only appends.
    s.clear();
    readEscapedStringInto(s, buf);
}
2016-04-15 00:33:21 +00:00
/// Explicit instantiations of readEscapedStringInto for PaddedPODArray<UInt8> and NullOutput outputs.
template void readEscapedStringInto < PaddedPODArray < UInt8 > > ( PaddedPODArray < UInt8 > & s , ReadBuffer & buf ) ;
2020-10-29 17:22:48 +00:00
template void readEscapedStringInto < NullOutput > ( NullOutput & s , ReadBuffer & buf ) ;
2016-02-16 16:39:39 +00:00
2011-06-15 18:54:18 +00:00
2017-06-25 03:43:37 +00:00
/** If enable_sql_style_quoting == true,
* strings like ' abc ' ' def ' will be parsed as abc ' def .
* Please note , that even with SQL style quoting enabled ,
* backslash escape sequences are also parsed ,
* that could be slightly confusing .
*/
2022-12-07 21:19:27 +00:00
template < char quote , bool enable_sql_style_quoting , typename Vector , typename ReturnType = void >
static ReturnType readAnyQuotedStringInto ( Vector & s , ReadBuffer & buf )
2010-06-04 18:25:25 +00:00
{
2022-12-07 21:19:27 +00:00
static constexpr bool throw_exception = std : : is_same_v < ReturnType , void > ;
2017-04-01 07:20:54 +00:00
if ( buf . eof ( ) | | * buf . position ( ) ! = quote )
2020-11-18 10:38:30 +00:00
{
2022-12-07 21:19:27 +00:00
if constexpr ( throw_exception )
throw ParsingException ( ErrorCodes : : CANNOT_PARSE_QUOTED_STRING ,
" Cannot parse quoted string: expected opening quote '{}', got '{}' " ,
std : : string { quote } , buf . eof ( ) ? " EOF " : std : : string { * buf . position ( ) } ) ;
else
2022-12-12 22:00:45 +00:00
return ReturnType ( false ) ;
2020-11-18 10:38:30 +00:00
}
2017-04-01 07:20:54 +00:00
+ + buf . position ( ) ;
2010-06-04 18:25:25 +00:00
2017-04-01 07:20:54 +00:00
while ( ! buf . eof ( ) )
{
2018-08-25 15:32:55 +00:00
char * next_pos = find_first_symbols < ' \\ ' , quote > ( buf . position ( ) , buf . buffer ( ) . end ( ) ) ;
2010-06-04 18:25:25 +00:00
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
appendToStringOrVector ( s , buf , next_pos ) ;
2018-08-25 15:32:55 +00:00
buf . position ( ) = next_pos ;
2015-02-07 23:13:04 +00:00
2017-04-01 07:20:54 +00:00
if ( ! buf . hasPendingData ( ) )
continue ;
2011-12-26 02:17:33 +00:00
2017-04-01 07:20:54 +00:00
if ( * buf . position ( ) = = quote )
{
+ + buf . position ( ) ;
2017-06-25 03:43:37 +00:00
if ( enable_sql_style_quoting & & ! buf . eof ( ) & & * buf . position ( ) = = quote )
{
s . push_back ( quote ) ;
+ + buf . position ( ) ;
continue ;
}
2022-12-07 21:19:27 +00:00
return ReturnType ( true ) ;
2017-04-01 07:20:54 +00:00
}
2010-06-04 18:25:25 +00:00
2017-04-01 07:20:54 +00:00
if ( * buf . position ( ) = = ' \\ ' )
2022-12-07 21:19:27 +00:00
{
if constexpr ( throw_exception )
parseComplexEscapeSequence < Vector , ReturnType > ( s , buf ) ;
else
{
if ( ! parseComplexEscapeSequence < Vector , ReturnType > ( s , buf ) )
2022-12-12 22:00:45 +00:00
return ReturnType ( false ) ;
2022-12-07 21:19:27 +00:00
}
}
2017-04-01 07:20:54 +00:00
}
2010-06-04 18:25:25 +00:00
2022-12-07 21:19:27 +00:00
if constexpr ( throw_exception )
throw ParsingException ( " Cannot parse quoted string: expected closing quote " ,
ErrorCodes : : CANNOT_PARSE_QUOTED_STRING ) ;
else
2022-12-12 22:00:45 +00:00
return ReturnType ( false ) ;
2010-06-04 18:25:25 +00:00
}
2017-06-25 03:43:37 +00:00
template < bool enable_sql_style_quoting , typename Vector >
2016-02-16 16:39:39 +00:00
void readQuotedStringInto ( Vector & s , ReadBuffer & buf )
{
2017-06-25 03:43:37 +00:00
readAnyQuotedStringInto < ' \' ' , enable_sql_style_quoting > ( s , buf ) ;
2016-02-16 16:39:39 +00:00
}
2022-12-07 21:19:27 +00:00
/// Non-throwing reader for single-quoted strings (SQL style quoting disabled):
/// forwards the bool result of readAnyQuotedStringInto.
template <typename Vector>
bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf)
{
    constexpr char single_quote = '\'';
    constexpr bool sql_style_quoting = false;
    return readAnyQuotedStringInto<single_quote, sql_style_quoting, Vector, bool>(s, buf);
}

template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf);
2017-06-25 03:43:37 +00:00
template < bool enable_sql_style_quoting , typename Vector >
2016-02-16 16:39:39 +00:00
void readDoubleQuotedStringInto ( Vector & s , ReadBuffer & buf )
{
2017-06-25 03:43:37 +00:00
readAnyQuotedStringInto < ' " ' , enable_sql_style_quoting > ( s , buf ) ;
2016-02-16 16:39:39 +00:00
}
2017-06-25 03:43:37 +00:00
template < bool enable_sql_style_quoting , typename Vector >
2016-02-16 16:39:39 +00:00
void readBackQuotedStringInto ( Vector & s , ReadBuffer & buf )
{
2017-06-25 03:43:37 +00:00
readAnyQuotedStringInto < ' ` ' , enable_sql_style_quoting > ( s , buf ) ;
2016-02-16 16:39:39 +00:00
}
2011-06-15 18:54:18 +00:00
/// Reads a single-quoted string into `s` (previous contents are discarded), SQL style quoting disabled.
void readQuotedString(String & s, ReadBuffer & buf)
{
    s.clear();

    constexpr bool sql_style_quoting = false;
    readQuotedStringInto<sql_style_quoting>(s, buf);
}
2017-06-25 03:43:37 +00:00
/// Reads a single-quoted string into `s` (previous contents are discarded), SQL style quoting enabled.
void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf)
{
    s.clear();

    constexpr bool sql_style_quoting = true;
    readQuotedStringInto<sql_style_quoting>(s, buf);
}
template void readQuotedStringInto < true > ( PaddedPODArray < UInt8 > & s , ReadBuffer & buf ) ;
2022-03-16 15:28:09 +00:00
template void readQuotedStringInto < true > ( String & s , ReadBuffer & buf ) ;
2022-05-03 13:56:25 +00:00
template void readQuotedStringInto < false > ( String & s , ReadBuffer & buf ) ;
2020-10-29 17:22:48 +00:00
template void readDoubleQuotedStringInto < false > ( NullOutput & s , ReadBuffer & buf ) ;
2022-05-03 13:56:25 +00:00
template void readDoubleQuotedStringInto < false > ( String & s , ReadBuffer & buf ) ;
2022-05-02 12:07:31 +00:00
template void readBackQuotedStringInto < false > ( String & s , ReadBuffer & buf ) ;
2016-02-16 16:39:39 +00:00
2011-06-15 18:54:18 +00:00
/// Reads a double-quoted string into `s` (previous contents are discarded), SQL style quoting disabled.
void readDoubleQuotedString(String & s, ReadBuffer & buf)
{
    s.clear();

    constexpr bool sql_style_quoting = false;
    readDoubleQuotedStringInto<sql_style_quoting>(s, buf);
}
2017-06-25 03:43:37 +00:00
/// Reads a double-quoted string into `s` (previous contents are discarded), SQL style quoting enabled.
void readDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf)
{
    s.clear();

    constexpr bool sql_style_quoting = true;
    readDoubleQuotedStringInto<sql_style_quoting>(s, buf);
}
2016-02-16 16:39:39 +00:00
2011-11-01 17:57:37 +00:00
/// Reads a back-quoted string into `s` (previous contents are discarded), SQL style quoting disabled.
void readBackQuotedString(String & s, ReadBuffer & buf)
{
    s.clear();

    constexpr bool sql_style_quoting = false;
    readBackQuotedStringInto<sql_style_quoting>(s, buf);
}
2017-06-25 03:43:37 +00:00
/// Reads a back-quoted string into `s` (previous contents are discarded), SQL style quoting enabled.
void readBackQuotedStringWithSQLStyle(String & s, ReadBuffer & buf)
{
    s.clear();

    constexpr bool sql_style_quoting = true;
    readBackQuotedStringInto<sql_style_quoting>(s, buf);
}
2012-05-08 05:42:05 +00:00
2022-05-20 14:57:27 +00:00
/// Satisfied by types that can be resized with `resize(n)` and whose `size()` yields an integral type.
template <typename T>
concept WithResize = requires (T value)
{
    value.resize(1);
    { value.size() } -> std::integral;
};
2016-02-07 08:42:21 +00:00
2016-02-16 16:39:39 +00:00
template < typename Vector >
2018-07-04 21:00:50 +00:00
void readCSVStringInto ( Vector & s , ReadBuffer & buf , const FormatSettings : : CSV & settings )
2016-02-16 16:39:39 +00:00
{
2022-08-29 11:18:53 +00:00
/// Empty string
2017-04-01 07:20:54 +00:00
if ( buf . eof ( ) )
2022-08-29 11:18:53 +00:00
return ;
2017-04-01 07:20:54 +00:00
2018-07-04 21:09:58 +00:00
const char delimiter = settings . delimiter ;
2018-06-29 07:34:12 +00:00
const char maybe_quote = * buf . position ( ) ;
2022-11-17 15:21:38 +00:00
const String & custom_delimiter = settings . custom_delimiter ;
2017-04-01 07:20:54 +00:00
/// Emptiness and not even in quotation marks.
2022-11-17 15:21:38 +00:00
if ( custom_delimiter . empty ( ) & & maybe_quote = = delimiter )
2017-04-01 07:20:54 +00:00
return ;
2018-07-04 21:09:58 +00:00
if ( ( settings . allow_single_quotes & & maybe_quote = = ' \' ' ) | | ( settings . allow_double_quotes & & maybe_quote = = ' " ' ) )
2017-04-01 07:20:54 +00:00
{
+ + buf . position ( ) ;
/// The quoted case. We are looking for the next quotation mark.
while ( ! buf . eof ( ) )
{
2018-08-25 15:32:55 +00:00
char * next_pos = reinterpret_cast < char * > ( memchr ( buf . position ( ) , maybe_quote , buf . buffer ( ) . end ( ) - buf . position ( ) ) ) ;
2017-04-01 07:20:54 +00:00
if ( nullptr = = next_pos )
next_pos = buf . buffer ( ) . end ( ) ;
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
appendToStringOrVector ( s , buf , next_pos ) ;
2018-08-25 15:32:55 +00:00
buf . position ( ) = next_pos ;
2017-04-01 07:20:54 +00:00
if ( ! buf . hasPendingData ( ) )
continue ;
/// Now there is a quotation mark under the cursor. Is there any following?
+ + buf . position ( ) ;
if ( buf . eof ( ) )
return ;
if ( * buf . position ( ) = = maybe_quote )
{
s . push_back ( maybe_quote ) ;
+ + buf . position ( ) ;
continue ;
}
return ;
}
}
else
{
2022-11-17 15:21:38 +00:00
/// If custom_delimiter is specified, we should read until first occurrences of
/// custom_delimiter in buffer.
if ( ! custom_delimiter . empty ( ) )
{
PeekableReadBuffer * peekable_buf = dynamic_cast < PeekableReadBuffer * > ( & buf ) ;
if ( ! peekable_buf )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Reading CSV string with custom delimiter is allowed only when using PeekableReadBuffer " ) ;
while ( true )
{
if ( peekable_buf - > eof ( ) )
throw Exception ( ErrorCodes : : INCORRECT_DATA , " Unexpected EOF while reading CSV string, expected custom delimiter \" {} \" " , custom_delimiter ) ;
char * next_pos = reinterpret_cast < char * > ( memchr ( peekable_buf - > position ( ) , custom_delimiter [ 0 ] , peekable_buf - > available ( ) ) ) ;
if ( ! next_pos )
next_pos = peekable_buf - > buffer ( ) . end ( ) ;
appendToStringOrVector ( s , * peekable_buf , next_pos ) ;
peekable_buf - > position ( ) = next_pos ;
if ( ! buf . hasPendingData ( ) )
continue ;
{
PeekableReadBufferCheckpoint checkpoint { * peekable_buf , true } ;
if ( checkString ( custom_delimiter , * peekable_buf ) )
return ;
}
s . push_back ( * peekable_buf - > position ( ) ) ;
+ + peekable_buf - > position ( ) ;
}
return ;
}
2017-04-01 07:20:54 +00:00
/// Unquoted case. Look for delimiter or \r or \n.
while ( ! buf . eof ( ) )
{
2018-08-25 15:32:55 +00:00
char * next_pos = buf . position ( ) ;
Enable sse2 for CSV parsing.
Testing data
```
select 'aaaaaaaa,bbbbbbbb,cccccccc,dddddddd,eeeeeeee,ffffffff,gggg,hhh' from numbers(3000000) into outfile '/tmp/test.csv'
```
Testing command
```
echo "select count() from file('/tmp/test.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
Before
```
QPS: 1.317, RPS: 3949749.687, MiB/s: 478.380, result RPS: 1.317, result MiB/s: 0.000.
0.000% 0.704 sec.
10.000% 0.712 sec.
20.000% 0.718 sec.
30.000% 0.726 sec.
40.000% 0.739 sec.
50.000% 0.754 sec.
60.000% 0.770 sec.
70.000% 0.788 sec.
80.000% 0.798 sec.
90.000% 0.815 sec.
95.000% 0.826 sec.
99.000% 0.850 sec.
99.900% 0.857 sec.
99.990% 0.858 sec.
```
After
```
QPS: 1.533, RPS: 4598308.336, MiB/s: 556.932, result RPS: 1.533, result MiB/s: 0.000.
0.000% 0.626 sec.
10.000% 0.635 sec.
20.000% 0.639 sec.
30.000% 0.642 sec.
40.000% 0.643 sec.
50.000% 0.645 sec.
60.000% 0.649 sec.
70.000% 0.652 sec.
80.000% 0.658 sec.
90.000% 0.682 sec.
95.000% 0.710 sec.
99.000% 0.727 sec.
99.900% 0.733 sec.
99.990% 0.734 sec.
```
2018-08-28 11:03:52 +00:00
2018-08-30 21:13:34 +00:00
[ & ] ( )
{
2019-01-04 12:10:00 +00:00
# ifdef __SSE2__
Enable sse2 for CSV parsing.
Testing data
```
select 'aaaaaaaa,bbbbbbbb,cccccccc,dddddddd,eeeeeeee,ffffffff,gggg,hhh' from numbers(3000000) into outfile '/tmp/test.csv'
```
Testing command
```
echo "select count() from file('/tmp/test.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
Before
```
QPS: 1.317, RPS: 3949749.687, MiB/s: 478.380, result RPS: 1.317, result MiB/s: 0.000.
0.000% 0.704 sec.
10.000% 0.712 sec.
20.000% 0.718 sec.
30.000% 0.726 sec.
40.000% 0.739 sec.
50.000% 0.754 sec.
60.000% 0.770 sec.
70.000% 0.788 sec.
80.000% 0.798 sec.
90.000% 0.815 sec.
95.000% 0.826 sec.
99.000% 0.850 sec.
99.900% 0.857 sec.
99.990% 0.858 sec.
```
After
```
QPS: 1.533, RPS: 4598308.336, MiB/s: 556.932, result RPS: 1.533, result MiB/s: 0.000.
0.000% 0.626 sec.
10.000% 0.635 sec.
20.000% 0.639 sec.
30.000% 0.642 sec.
40.000% 0.643 sec.
50.000% 0.645 sec.
60.000% 0.649 sec.
70.000% 0.652 sec.
80.000% 0.658 sec.
90.000% 0.682 sec.
95.000% 0.710 sec.
99.000% 0.727 sec.
99.900% 0.733 sec.
99.990% 0.734 sec.
```
2018-08-28 11:03:52 +00:00
auto rc = _mm_set1_epi8 ( ' \r ' ) ;
auto nc = _mm_set1_epi8 ( ' \n ' ) ;
auto dc = _mm_set1_epi8 ( delimiter ) ;
for ( ; next_pos + 15 < buf . buffer ( ) . end ( ) ; next_pos + = 16 )
{
__m128i bytes = _mm_loadu_si128 ( reinterpret_cast < const __m128i * > ( next_pos ) ) ;
auto eq = _mm_or_si128 ( _mm_or_si128 ( _mm_cmpeq_epi8 ( bytes , rc ) , _mm_cmpeq_epi8 ( bytes , nc ) ) , _mm_cmpeq_epi8 ( bytes , dc ) ) ;
uint16_t bit_mask = _mm_movemask_epi8 ( eq ) ;
if ( bit_mask )
{
2022-07-31 14:34:05 +00:00
next_pos + = std : : countr_zero ( bit_mask ) ;
Enable sse2 for CSV parsing.
Testing data
```
select 'aaaaaaaa,bbbbbbbb,cccccccc,dddddddd,eeeeeeee,ffffffff,gggg,hhh' from numbers(3000000) into outfile '/tmp/test.csv'
```
Testing command
```
echo "select count() from file('/tmp/test.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
Before
```
QPS: 1.317, RPS: 3949749.687, MiB/s: 478.380, result RPS: 1.317, result MiB/s: 0.000.
0.000% 0.704 sec.
10.000% 0.712 sec.
20.000% 0.718 sec.
30.000% 0.726 sec.
40.000% 0.739 sec.
50.000% 0.754 sec.
60.000% 0.770 sec.
70.000% 0.788 sec.
80.000% 0.798 sec.
90.000% 0.815 sec.
95.000% 0.826 sec.
99.000% 0.850 sec.
99.900% 0.857 sec.
99.990% 0.858 sec.
```
After
```
QPS: 1.533, RPS: 4598308.336, MiB/s: 556.932, result RPS: 1.533, result MiB/s: 0.000.
0.000% 0.626 sec.
10.000% 0.635 sec.
20.000% 0.639 sec.
30.000% 0.642 sec.
40.000% 0.643 sec.
50.000% 0.645 sec.
60.000% 0.649 sec.
70.000% 0.652 sec.
80.000% 0.658 sec.
90.000% 0.682 sec.
95.000% 0.710 sec.
99.000% 0.727 sec.
99.900% 0.733 sec.
99.990% 0.734 sec.
```
2018-08-28 11:03:52 +00:00
return ;
}
}
2022-06-15 13:19:29 +00:00
# elif defined(__aarch64__) && defined(__ARM_NEON)
auto rc = vdupq_n_u8 ( ' \r ' ) ;
auto nc = vdupq_n_u8 ( ' \n ' ) ;
auto dc = vdupq_n_u8 ( delimiter ) ;
2022-06-15 14:40:21 +00:00
/// Returns a 64 bit mask of nibbles (4 bits for each byte).
2022-06-15 13:19:29 +00:00
auto get_nibble_mask = [ ] ( uint8x16_t input ) - > uint64_t
{ return vget_lane_u64 ( vreinterpret_u64_u8 ( vshrn_n_u16 ( vreinterpretq_u16_u8 ( input ) , 4 ) ) , 0 ) ; } ;
for ( ; next_pos + 15 < buf . buffer ( ) . end ( ) ; next_pos + = 16 )
{
uint8x16_t bytes = vld1q_u8 ( reinterpret_cast < const uint8_t * > ( next_pos ) ) ;
auto eq = vorrq_u8 ( vorrq_u8 ( vceqq_u8 ( bytes , rc ) , vceqq_u8 ( bytes , nc ) ) , vceqq_u8 ( bytes , dc ) ) ;
uint64_t bit_mask = get_nibble_mask ( eq ) ;
if ( bit_mask )
{
2022-07-31 14:34:05 +00:00
next_pos + = std : : countr_zero ( bit_mask ) > > 2 ;
2022-06-15 13:19:29 +00:00
return ;
}
}
Enable sse2 for CSV parsing.
Testing data
```
select 'aaaaaaaa,bbbbbbbb,cccccccc,dddddddd,eeeeeeee,ffffffff,gggg,hhh' from numbers(3000000) into outfile '/tmp/test.csv'
```
Testing command
```
echo "select count() from file('/tmp/test.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
Before
```
QPS: 1.317, RPS: 3949749.687, MiB/s: 478.380, result RPS: 1.317, result MiB/s: 0.000.
0.000% 0.704 sec.
10.000% 0.712 sec.
20.000% 0.718 sec.
30.000% 0.726 sec.
40.000% 0.739 sec.
50.000% 0.754 sec.
60.000% 0.770 sec.
70.000% 0.788 sec.
80.000% 0.798 sec.
90.000% 0.815 sec.
95.000% 0.826 sec.
99.000% 0.850 sec.
99.900% 0.857 sec.
99.990% 0.858 sec.
```
After
```
QPS: 1.533, RPS: 4598308.336, MiB/s: 556.932, result RPS: 1.533, result MiB/s: 0.000.
0.000% 0.626 sec.
10.000% 0.635 sec.
20.000% 0.639 sec.
30.000% 0.642 sec.
40.000% 0.643 sec.
50.000% 0.645 sec.
60.000% 0.649 sec.
70.000% 0.652 sec.
80.000% 0.658 sec.
90.000% 0.682 sec.
95.000% 0.710 sec.
99.000% 0.727 sec.
99.900% 0.733 sec.
99.990% 0.734 sec.
```
2018-08-28 11:03:52 +00:00
# endif
while ( next_pos < buf . buffer ( ) . end ( )
& & * next_pos ! = delimiter & & * next_pos ! = ' \r ' & & * next_pos ! = ' \n ' )
+ + next_pos ;
} ( ) ;
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
appendToStringOrVector ( s , buf , next_pos ) ;
2018-08-25 15:32:55 +00:00
buf . position ( ) = next_pos ;
2017-04-01 07:20:54 +00:00
if ( ! buf . hasPendingData ( ) )
continue ;
2022-05-20 14:57:27 +00:00
if constexpr ( WithResize < Vector > )
2022-05-13 13:51:28 +00:00
{
/** CSV format can contain insignificant spaces and tabs.
2022-05-13 14:27:15 +00:00
* Usually the task of skipping them is for the calling code .
* But in this case , it will be difficult to do this , so remove the trailing whitespace by ourself .
*/
2022-05-13 13:51:28 +00:00
size_t size = s . size ( ) ;
while ( size > 0 & & ( s [ size - 1 ] = = ' ' | | s [ size - 1 ] = = ' \t ' ) )
- - size ;
2017-04-01 07:20:54 +00:00
2022-05-13 13:51:28 +00:00
s . resize ( size ) ;
}
2017-04-01 07:20:54 +00:00
return ;
}
}
2016-02-07 08:42:21 +00:00
}
2018-07-04 21:00:50 +00:00
/// Reads one CSV field into `s`; whatever `s` held before the call is discarded.
void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings)
{
    /// Reset the output here: the call below receives `s` as-is.
    s.clear();
    readCSVStringInto(s, buf, settings);
}
2021-12-15 11:30:57 +00:00
/** Reads one CSV field into `s` (previous contents are discarded).
  * If the field starts with a single or double quote, that quote character is put
  *  both before and after the value read by readCSVStringInto.
  */
void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings)
{
    s.clear();

    char surrounding_quote = '\'';
    const bool starts_with_quote = !buf.eof() && (*buf.position() == '\'' || *buf.position() == '"');

    if (starts_with_quote)
    {
        /// Remember which quote it is: it is appended again after the value.
        surrounding_quote = *buf.position();
        s.push_back(surrounding_quote);
    }

    readCSVStringInto(s, buf, settings);

    if (starts_with_quote)
        s.push_back(surrounding_quote);
}
2022-11-17 15:21:38 +00:00
/** Reads data from `buf` into `s` until first_delimiter or second_delimiter is met.
  *
  * Both delimiters must be non-empty, otherwise an exception with code BAD_ARGUMENTS is thrown.
  * If the data ends before any of the delimiters is found, an exception with code INCORRECT_DATA is thrown.
  * Characters that merely start like a delimiter but do not match it entirely are kept as part of the value.
  */
void readCSVWithTwoPossibleDelimitersImpl(String & s, PeekableReadBuffer & buf, const String & first_delimiter, const String & second_delimiter)
{
    /// Check that delimiters are not empty.
    if (first_delimiter.empty() || second_delimiter.empty())
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read CSV field with two possible delimiters, one of delimiters '{}' and '{}' is empty", first_delimiter, second_delimiter);

    /// Read all data until first_delimiter or second_delimiter
    while (true)
    {
        if (buf.eof())
            throw Exception(ErrorCodes::INCORRECT_DATA, R"(Unexpected EOF while reading CSV string, expected one of delimiters "{}" or "{}")", first_delimiter, second_delimiter);

        /// Skip to the next character that could start either delimiter.
        char * next_pos = buf.position();
        while (next_pos != buf.buffer().end() && *next_pos != first_delimiter[0] && *next_pos != second_delimiter[0])
            ++next_pos;

        appendToStringOrVector(s, buf, next_pos);
        buf.position() = next_pos;

        /// No candidate in the current buffer: go on with the next portion of data.
        if (!buf.hasPendingData())
            continue;

        if (*buf.position() == first_delimiter[0])
        {
            /// The checkpoint lives only for the duration of the delimiter check.
            PeekableReadBufferCheckpoint checkpoint(buf, true);
            if (checkString(first_delimiter, buf))
                return;
        }

        if (*buf.position() == second_delimiter[0])
        {
            PeekableReadBufferCheckpoint checkpoint(buf, true);
            if (checkString(second_delimiter, buf))
                return;
        }

        /// Neither delimiter matched: the character is part of the value.
        s.push_back(*buf.position());
        ++buf.position();
    }
}
/** Reads a CSV string that ends at first_delimiter or second_delimiter.
  * A value that starts with a quote allowed by `settings` is read by readCSVStringInto instead.
  */
String readCSVStringWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const FormatSettings::CSV & settings, const String & first_delimiter, const String & second_delimiter)
{
    String result;

    /// If value is quoted, use regular CSV reading since we need to read only data inside quotes.
    const bool is_quoted = !buf.eof()
        && ((settings.allow_single_quotes && *buf.position() == '\'')
            || (settings.allow_double_quotes && *buf.position() == '"'));

    if (is_quoted)
        readCSVStringInto(result, buf, settings);
    else
        readCSVWithTwoPossibleDelimitersImpl(result, buf, first_delimiter, second_delimiter);

    return result;
}
/** Reads a CSV field that ends at first_delimiter or second_delimiter.
  * A value that starts with a quote allowed by `settings` is read by readCSVField instead.
  */
String readCSVFieldWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const FormatSettings::CSV & settings, const String & first_delimiter, const String & second_delimiter)
{
    String result;

    /// If value is quoted, use regular CSV reading since we need to read only data inside quotes.
    const bool is_quoted = !buf.eof()
        && ((settings.allow_single_quotes && *buf.position() == '\'')
            || (settings.allow_double_quotes && *buf.position() == '"'));

    if (is_quoted)
        readCSVField(result, buf, settings);
    else
        readCSVWithTwoPossibleDelimitersImpl(result, buf, first_delimiter, second_delimiter);

    return result;
}
2018-07-04 21:00:50 +00:00
/// Explicit instantiations of readCSVStringInto for PaddedPODArray<UInt8> and NullOutput.
template void readCSVStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
template void readCSVStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
2016-02-16 16:39:39 +00:00
2016-02-07 08:42:21 +00:00
2017-08-09 01:34:01 +00:00
template < typename Vector , typename ReturnType >
ReturnType readJSONStringInto ( Vector & s , ReadBuffer & buf )
2016-02-18 11:44:50 +00:00
{
2017-12-25 04:01:46 +00:00
static constexpr bool throw_exception = std : : is_same_v < ReturnType , void > ;
2017-08-09 01:34:01 +00:00
2018-07-16 06:09:27 +00:00
auto error = [ ] ( const char * message [[maybe_unused]], int code [[maybe_unused]] )
2017-08-09 01:34:01 +00:00
{
2018-07-14 23:39:00 +00:00
if constexpr ( throw_exception )
2020-12-10 17:26:36 +00:00
throw ParsingException ( message , code ) ;
2017-08-09 01:34:01 +00:00
return ReturnType ( false ) ;
} ;
2017-04-01 07:20:54 +00:00
if ( buf . eof ( ) | | * buf . position ( ) ! = ' " ' )
2017-08-09 01:34:01 +00:00
return error ( " Cannot parse JSON string: expected opening quote " , ErrorCodes : : CANNOT_PARSE_QUOTED_STRING ) ;
2017-04-01 07:20:54 +00:00
+ + buf . position ( ) ;
2016-02-18 11:44:50 +00:00
2017-04-01 07:20:54 +00:00
while ( ! buf . eof ( ) )
{
2018-08-25 15:32:55 +00:00
char * next_pos = find_first_symbols < ' \\ ' , ' " ' > ( buf . position ( ) , buf . buffer ( ) . end ( ) ) ;
2016-02-18 11:44:50 +00:00
Padding for IO buffers.
Testing data
```
select 'aaaaaaaa','bbbbbbbb','cccccccc','dddddddd','eeeeeeee','ffffffff','gggg','hhh' from numbers(3000000) into outfile '/tmp/test.tsv'
```
Testing command
```
echo "select count() from file('/tmp/test.tsv', CSV, 'a String, b String, c String, d String, e String, f String, g String, h String') where not ignore(e)" | clickhouse-benchmark
```
TSV parser has less overhead than CSV, using it would better unveil the benefits of memcpySmall.
Before
```
QPS: 1.662, RPS: 4985463.906, MiB/s: 603.823, result RPS: 1.662, result MiB/s: 0.000.
0.000% 0.559 sec.
10.000% 0.564 sec.
20.000% 0.568 sec.
30.000% 0.572 sec.
40.000% 0.575 sec.
50.000% 0.581 sec.
60.000% 0.592 sec.
70.000% 0.624 sec.
80.000% 0.639 sec.
90.000% 0.664 sec.
95.000% 0.686 sec.
99.000% 0.711 sec.
99.900% 0.715 sec.
99.990% 0.716 sec.
```
After
```
QPS: 1.861, RPS: 5582303.107, MiB/s: 676.110, result RPS: 1.861, result MiB/s: 0.000.
0.000% 0.510 sec.
10.000% 0.514 sec.
20.000% 0.517 sec.
30.000% 0.521 sec.
40.000% 0.523 sec.
50.000% 0.527 sec.
60.000% 0.530 sec.
70.000% 0.539 sec.
80.000% 0.558 sec.
90.000% 0.584 sec.
95.000% 0.589 sec.
99.000% 0.608 sec.
99.900% 0.655 sec.
99.990% 0.663 sec.
```
2018-08-27 19:14:15 +00:00
appendToStringOrVector ( s , buf , next_pos ) ;
2018-08-25 15:32:55 +00:00
buf . position ( ) = next_pos ;
2016-02-18 11:44:50 +00:00
2017-04-01 07:20:54 +00:00
if ( ! buf . hasPendingData ( ) )
continue ;
2016-02-18 11:44:50 +00:00
2017-04-01 07:20:54 +00:00
if ( * buf . position ( ) = = ' " ' )
{
+ + buf . position ( ) ;
2017-08-09 01:34:01 +00:00
return ReturnType ( true ) ;
2017-04-01 07:20:54 +00:00
}
2016-02-18 11:44:50 +00:00
2017-04-01 07:20:54 +00:00
if ( * buf . position ( ) = = ' \\ ' )
2017-08-09 01:34:01 +00:00
parseJSONEscapeSequence < Vector , ReturnType > ( s , buf ) ;
2017-04-01 07:20:54 +00:00
}
2016-02-18 11:44:50 +00:00
2017-08-09 01:34:01 +00:00
return error ( " Cannot parse JSON string: expected closing quote " , ErrorCodes : : CANNOT_PARSE_QUOTED_STRING ) ;
2016-02-18 11:44:50 +00:00
}
/// Reads a JSON string into `s`; whatever `s` held before the call is discarded.
void readJSONString(String & s, ReadBuffer & buf)
{
    /// Reset the output here: the call below receives `s` as-is.
    s.clear();
    readJSONStringInto(s, buf);
}
2017-08-09 01:34:01 +00:00
/// Explicit instantiations of readJSONStringInto.
template void readJSONStringInto<PaddedPODArray<UInt8>, void>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
template bool readJSONStringInto<PaddedPODArray<UInt8>, bool>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
template void readJSONStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
template void readJSONStringInto<String>(String & s, ReadBuffer & buf);
template bool readJSONStringInto<String, bool>(String & s, ReadBuffer & buf);
2016-02-18 11:44:50 +00:00
2021-04-23 12:53:38 +00:00
template < typename Vector , typename ReturnType >
ReturnType readJSONObjectPossiblyInvalid ( Vector & s , ReadBuffer & buf )
{
static constexpr bool throw_exception = std : : is_same_v < ReturnType , void > ;
auto error = [ ] ( const char * message [[maybe_unused]], int code [[maybe_unused]] )
{
if constexpr ( throw_exception )
throw ParsingException ( message , code ) ;
return ReturnType ( false ) ;
} ;
if ( buf . eof ( ) | | * buf . position ( ) ! = ' { ' )
2021-05-03 00:56:19 +00:00
return error ( " JSON should start from opening curly bracket " , ErrorCodes : : INCORRECT_DATA ) ;
2021-04-23 12:53:38 +00:00
s . push_back ( * buf . position ( ) ) ;
+ + buf . position ( ) ;
2021-09-09 21:16:57 +00:00
2021-04-23 12:53:38 +00:00
Int64 balance = 1 ;
2021-09-09 21:16:57 +00:00
bool quotes = false ;
2021-04-23 12:53:38 +00:00
while ( ! buf . eof ( ) )
{
2021-09-09 21:16:57 +00:00
char * next_pos = find_first_symbols < ' \\ ' , ' { ' , ' } ' , ' " ' > ( buf . position ( ) , buf . buffer ( ) . end ( ) ) ;
2021-04-23 12:53:38 +00:00
appendToStringOrVector ( s , buf , next_pos ) ;
buf . position ( ) = next_pos ;
if ( ! buf . hasPendingData ( ) )
continue ;
2021-09-09 21:16:57 +00:00
s . push_back ( * buf . position ( ) ) ;
2021-04-23 12:53:38 +00:00
if ( * buf . position ( ) = = ' \\ ' )
{
2021-09-09 21:16:57 +00:00
+ + buf . position ( ) ;
if ( ! buf . eof ( ) )
{
s . push_back ( * buf . position ( ) ) ;
+ + buf . position ( ) ;
}
2021-04-23 12:53:38 +00:00
continue ;
}
2021-09-09 21:16:57 +00:00
if ( * buf . position ( ) = = ' " ' )
quotes = ! quotes ;
else if ( ! quotes ) / / can be only ' { ' or ' } '
balance + = * buf . position ( ) = = ' { ' ? 1 : - 1 ;
2021-04-23 12:53:38 +00:00
+ + buf . position ( ) ;
2021-05-03 00:56:19 +00:00
2021-04-23 12:53:38 +00:00
if ( balance = = 0 )
return ReturnType ( true ) ;
2021-05-03 00:56:19 +00:00
2022-03-01 16:32:55 +00:00
if ( balance < 0 )
2021-05-03 00:56:19 +00:00
break ;
2021-04-23 12:53:38 +00:00
}
return error ( " JSON should have equal number of opening and closing brackets " , ErrorCodes : : INCORRECT_DATA ) ;
}
template void readJSONObjectPossiblyInvalid < String > ( String & s , ReadBuffer & buf ) ;
2016-02-18 11:44:50 +00:00
2018-07-11 21:43:09 +00:00
template < typename ReturnType >
ReturnType readDateTextFallback ( LocalDate & date , ReadBuffer & buf )
2017-11-15 02:08:55 +00:00
{
2018-07-11 21:43:09 +00:00
static constexpr bool throw_exception = std : : is_same_v < ReturnType , void > ;
2018-07-14 23:39:00 +00:00
auto error = [ ]
{
if constexpr ( throw_exception )
throw Exception ( " Cannot parse date: value is too short " , ErrorCodes : : CANNOT_PARSE_DATE ) ;
return ReturnType ( false ) ;
} ;
2017-11-15 02:08:55 +00:00
2018-07-14 23:39:00 +00:00
auto append_digit = [ & ] ( auto & x )
2017-11-15 02:08:55 +00:00
{
2018-07-14 23:39:00 +00:00
if ( ! buf . eof ( ) & & isNumericASCII ( * buf . position ( ) ) )
{
x = x * 10 + ( * buf . position ( ) - ' 0 ' ) ;
+ + buf . position ( ) ;
return true ;
}
else
return false ;
} ;
UInt16 year = 0 ;
2021-10-29 21:23:03 +00:00
UInt8 month = 0 ;
UInt8 day = 0 ;
2018-07-14 23:39:00 +00:00
if ( ! append_digit ( year )
2020-03-18 03:27:32 +00:00
| | ! append_digit ( year ) // NOLINT
| | ! append_digit ( year ) // NOLINT
| | ! append_digit ( year ) ) // NOLINT
2018-07-14 23:39:00 +00:00
return error ( ) ;
2021-10-29 21:23:03 +00:00
if ( buf . eof ( ) )
2018-07-14 23:39:00 +00:00
return error ( ) ;
2021-10-29 21:23:03 +00:00
if ( isNumericASCII ( * buf . position ( ) ) )
{
/// YYYYMMDD
if ( ! append_digit ( month )
| | ! append_digit ( month ) // NOLINT
| | ! append_digit ( day )
| | ! append_digit ( day ) ) // NOLINT
return error ( ) ;
}
else
{
+ + buf . position ( ) ;
2018-07-14 23:39:00 +00:00
2021-10-29 21:23:03 +00:00
if ( ! append_digit ( month ) )
return error ( ) ;
append_digit ( month ) ;
2018-07-14 23:39:00 +00:00
2021-10-29 21:23:03 +00:00
if ( ! buf . eof ( ) & & ! isNumericASCII ( * buf . position ( ) ) )
+ + buf . position ( ) ;
else
return error ( ) ;
if ( ! append_digit ( day ) )
return error ( ) ;
append_digit ( day ) ;
}
2017-11-15 02:08:55 +00:00
date = LocalDate ( year , month , day ) ;
2018-07-11 21:43:09 +00:00
return ReturnType ( true ) ;
2017-11-15 02:08:55 +00:00
}
2018-07-11 21:43:09 +00:00
template void readDateTextFallback < void > ( LocalDate & , ReadBuffer & ) ;
template bool readDateTextFallback < bool > ( LocalDate & , ReadBuffer & ) ;
2017-11-15 02:08:55 +00:00
2018-07-11 21:43:09 +00:00
template < typename ReturnType >
ReturnType readDateTimeTextFallback ( time_t & datetime , ReadBuffer & buf , const DateLUTImpl & date_lut )
2015-04-01 02:55:52 +00:00
{
2018-07-11 21:43:09 +00:00
static constexpr bool throw_exception = std : : is_same_v < ReturnType , void > ;
2020-10-16 21:31:29 +00:00
/// YYYY-MM-DD
static constexpr auto date_broken_down_length = 10 ;
2022-08-21 21:12:41 +00:00
/// hh:mm:ss
static constexpr auto time_broken_down_length = 8 ;
/// YYYY-MM-DD hh:mm:ss
static constexpr auto date_time_broken_down_length = date_broken_down_length + 1 + time_broken_down_length ;
2017-04-01 07:20:54 +00:00
2020-03-23 02:12:31 +00:00
char s [ date_time_broken_down_length ] ;
2017-04-01 07:20:54 +00:00
char * s_pos = s ;
2021-02-21 12:34:04 +00:00
/** Read characters, that could represent unix timestamp.
* Only unix timestamp of at least 5 characters is supported .
* Then look at 5 th character . If it is a number - treat whole as unix timestamp .
* If it is not a number - then parse datetime in YYYY - MM - DD hh : mm : ss or YYYY - MM - DD format .
*/
/// A piece similar to unix timestamp, maybe scaled to subsecond precision.
while ( s_pos < s + date_time_broken_down_length & & ! buf . eof ( ) & & isNumericASCII ( * buf . position ( ) ) )
2017-04-01 07:20:54 +00:00
{
* s_pos = * buf . position ( ) ;
+ + s_pos ;
+ + buf . position ( ) ;
}
2020-10-16 21:31:29 +00:00
/// 2015-01-01 01:02:03 or 2015-01-01
2021-02-21 12:34:04 +00:00
if ( s_pos = = s + 4 & & ! buf . eof ( ) & & ! isNumericASCII ( * buf . position ( ) ) )
2017-04-01 07:20:54 +00:00
{
2020-10-16 21:31:29 +00:00
const auto already_read_length = s_pos - s ;
const size_t remaining_date_size = date_broken_down_length - already_read_length ;
2022-08-21 21:12:41 +00:00
size_t size = buf . read ( s_pos , remaining_date_size ) ;
if ( size ! = remaining_date_size )
2017-04-01 07:20:54 +00:00
{
s_pos [ size ] = 0 ;
2018-07-11 21:43:09 +00:00
if constexpr ( throw_exception )
2022-08-21 21:12:41 +00:00
throw ParsingException ( std : : string ( " Cannot parse DateTime " ) + s , ErrorCodes : : CANNOT_PARSE_DATETIME ) ;
2018-07-11 21:43:09 +00:00
else
return false ;
2017-04-01 07:20:54 +00:00
}
UInt16 year = ( s [ 0 ] - ' 0 ' ) * 1000 + ( s [ 1 ] - ' 0 ' ) * 100 + ( s [ 2 ] - ' 0 ' ) * 10 + ( s [ 3 ] - ' 0 ' ) ;
UInt8 month = ( s [ 5 ] - ' 0 ' ) * 10 + ( s [ 6 ] - ' 0 ' ) ;
UInt8 day = ( s [ 8 ] - ' 0 ' ) * 10 + ( s [ 9 ] - ' 0 ' ) ;
2020-10-16 21:31:29 +00:00
UInt8 hour = 0 ;
UInt8 minute = 0 ;
UInt8 second = 0 ;
2022-08-21 21:12:41 +00:00
if ( ! buf . eof ( ) & & ( * buf . position ( ) = = ' ' | | * buf . position ( ) = = ' T ' ) )
2020-10-16 21:31:29 +00:00
{
2022-08-21 21:12:41 +00:00
+ + buf . position ( ) ;
size = buf . read ( s , time_broken_down_length ) ;
if ( size ! = time_broken_down_length )
{
s_pos [ size ] = 0 ;
if constexpr ( throw_exception )
throw ParsingException ( std : : string ( " Cannot parse time component of DateTime " ) + s , ErrorCodes : : CANNOT_PARSE_DATETIME ) ;
else
return false ;
}
hour = ( s [ 0 ] - ' 0 ' ) * 10 + ( s [ 1 ] - ' 0 ' ) ;
minute = ( s [ 3 ] - ' 0 ' ) * 10 + ( s [ 4 ] - ' 0 ' ) ;
second = ( s [ 6 ] - ' 0 ' ) * 10 + ( s [ 7 ] - ' 0 ' ) ;
2020-10-16 21:31:29 +00:00
}
2017-04-01 07:20:54 +00:00
if ( unlikely ( year = = 0 ) )
datetime = 0 ;
else
datetime = date_lut . makeDateTime ( year , month , day , hour , minute , second ) ;
}
else
2018-07-14 23:39:00 +00:00
{
2021-02-21 12:34:04 +00:00
if ( s_pos - s > = 5 )
2018-07-14 23:39:00 +00:00
{
/// Not very efficient.
datetime = 0 ;
for ( const char * digit_pos = s ; digit_pos < s_pos ; + + digit_pos )
datetime = datetime * 10 + * digit_pos - ' 0 ' ;
}
else
{
if constexpr ( throw_exception )
2020-12-10 17:26:36 +00:00
throw ParsingException ( " Cannot parse datetime " , ErrorCodes : : CANNOT_PARSE_DATETIME ) ;
2018-07-14 23:39:00 +00:00
else
return false ;
}
}
2018-07-11 21:43:09 +00:00
return ReturnType ( true ) ;
2015-04-01 02:55:52 +00:00
}
2018-07-11 21:43:09 +00:00
/// Explicit instantiations of readDateTimeTextFallback:
/// the void variant throws ParsingException on failure, the bool variant returns false instead.
template void readDateTimeTextFallback < void > ( time_t & , ReadBuffer & , const DateLUTImpl & ) ;
template bool readDateTimeTextFallback < bool > ( time_t & , ReadBuffer & , const DateLUTImpl & ) ;
2015-04-01 02:55:52 +00:00
2022-07-15 11:15:46 +00:00
/** Skips one JSON value located at the current position of `buf`, discarding its content.
  *
  * Recognized values: double-quoted string, number (may start with a digit, '-', '+' or '.'),
  * the literals null / true / false, array and object. Arrays and objects are skipped recursively.
  * Inside an object the comma between members is optional, so a missing or trailing comma is tolerated.
  *
  * @param buf            input; positioned right after the skipped value on return.
  * @param name_of_field  key name, used only to build error messages.
  * @throws Exception with ErrorCodes::INCORRECT_DATA on unexpected EOF or an unexpected symbol.
  */
void skipJSONField(ReadBuffer & buf, StringRef name_of_field)
{
    if (buf.eof())
        throw Exception("Unexpected EOF for key '" + name_of_field.toString() + "'", ErrorCodes::INCORRECT_DATA);
    else if (*buf.position() == '"') /// skip double-quoted string
    {
        NullOutput sink;
        readJSONStringInto(sink, buf);
    }
    else if (isNumericASCII(*buf.position()) || *buf.position() == '-' || *buf.position() == '+' || *buf.position() == '.') /// skip number
    {
        /// A leading '+' is consumed here before the number is handed to the float parser.
        if (*buf.position() == '+')
            ++buf.position();

        double v;
        if (!tryReadFloatText(v, buf))
            throw Exception("Expected a number field for key '" + name_of_field.toString() + "'", ErrorCodes::INCORRECT_DATA);
    }
    else if (*buf.position() == 'n') /// skip null
    {
        assertString("null", buf);
    }
    else if (*buf.position() == 't') /// skip true
    {
        assertString("true", buf);
    }
    else if (*buf.position() == 'f') /// skip false
    {
        assertString("false", buf);
    }
    else if (*buf.position() == '[')
    {
        ++buf.position();
        skipWhitespaceIfAny(buf);

        if (!buf.eof() && *buf.position() == ']') /// skip empty array
        {
            ++buf.position();
            return;
        }

        while (true)
        {
            skipJSONField(buf, name_of_field);
            skipWhitespaceIfAny(buf);

            if (!buf.eof() && *buf.position() == ',')
            {
                ++buf.position();
                skipWhitespaceIfAny(buf);
            }
            else if (!buf.eof() && *buf.position() == ']')
            {
                ++buf.position();
                break;
            }
            else
                throw Exception("Unexpected symbol for key '" + name_of_field.toString() + "'", ErrorCodes::INCORRECT_DATA);
        }
    }
    else if (*buf.position() == '{') /// skip whole object
    {
        ++buf.position();
        skipWhitespaceIfAny(buf);

        while (!buf.eof() && *buf.position() != '}')
        {
            // field name
            if (*buf.position() == '"')
            {
                NullOutput sink;
                readJSONStringInto(sink, buf);
            }
            else
                throw Exception("Unexpected symbol for key '" + name_of_field.toString() + "'", ErrorCodes::INCORRECT_DATA);

            // ':'
            skipWhitespaceIfAny(buf);
            if (buf.eof() || !(*buf.position() == ':'))
                throw Exception("Unexpected symbol for key '" + name_of_field.toString() + "'", ErrorCodes::INCORRECT_DATA);
            ++buf.position();
            skipWhitespaceIfAny(buf);

            skipJSONField(buf, name_of_field);
            skipWhitespaceIfAny(buf);

            // optional ','
            if (!buf.eof() && *buf.position() == ',')
            {
                ++buf.position();
                skipWhitespaceIfAny(buf);
            }
        }

        if (buf.eof())
            throw Exception("Unexpected EOF for key '" + name_of_field.toString() + "'", ErrorCodes::INCORRECT_DATA);
        ++buf.position();
    }
    else
    {
        /// std::string(count, ch): one copy of the offending character.
        /// With the arguments swapped the character code would be taken as the repeat count of '\x01'.
        throw Exception("Unexpected symbol '" + std::string(1, *buf.position()) + "' for key '" + name_of_field.toString() + "'", ErrorCodes::INCORRECT_DATA);
    }
}
2021-01-19 21:42:31 +00:00
/// Reads a serialized exception from `buf` and builds an Exception object from it.
/// The fields are read with readBinary in this order: code, name, message, stack trace, has_nested.
/// The resulting text is composed of: `additional_message` (if not empty), the exception name
/// (unless it equals the DB::Exception literal below), the message, and the stack trace (if not empty).
/// `remote_exception` is forwarded to the Exception constructor unchanged.
Exception readException ( ReadBuffer & buf , const String & additional_message , bool remote_exception )
2012-05-08 05:42:05 +00:00
{
2017-04-01 07:20:54 +00:00
int code = 0 ;
String name ;
String message ;
String stack_trace ;
2020-01-02 06:56:53 +00:00
bool has_nested = false ; /// Obsolete: still read from the buffer below, but its value is not used afterwards.
2015-04-01 02:55:52 +00:00
2017-04-01 07:20:54 +00:00
/// The read order below must match the order in which the fields were written.
readBinary ( code , buf ) ;
readBinary ( name , buf ) ;
readBinary ( message , buf ) ;
readBinary ( stack_trace , buf ) ;
readBinary ( has_nested , buf ) ;
2012-05-08 05:42:05 +00:00
2017-07-31 21:39:24 +00:00
/// Assemble the human-readable text of the exception.
WriteBufferFromOwnString out ;
2012-05-08 05:42:05 +00:00
2017-07-31 21:39:24 +00:00
/// Optional caller-supplied prefix.
if ( ! additional_message . empty ( ) )
out < < additional_message < < " . " ;
2012-05-08 05:42:05 +00:00
2017-07-31 21:39:24 +00:00
/// The name is printed only when it differs from the literal it is compared with.
if ( name ! = " DB::Exception " )
out < < name < < " . " ;
2012-05-08 05:42:05 +00:00
2018-08-24 07:35:53 +00:00
out < < message < < " . " ;
/// The stack trace section is appended only when a trace was received.
if ( ! stack_trace . empty ( ) )
out < < " Stack trace: \n \n " < < stack_trace ;
2012-05-08 05:42:05 +00:00
2021-01-19 21:42:31 +00:00
return Exception ( out . str ( ) , code , remote_exception ) ;
2012-05-08 05:42:05 +00:00
}
/// Reads a serialized exception from `buf` via readException and calls rethrow() on the result.
/// The `remote_exception` argument of readException is not passed here, so the default from its
/// declaration (not visible in this chunk) applies.
void readAndThrowException ( ReadBuffer & buf , const String & additional_message )
{
2020-01-05 23:00:31 +00:00
readException ( buf , additional_message ) . rethrow ( ) ;
2012-05-08 05:42:05 +00:00
}
2016-08-16 21:23:53 +00:00
2021-02-19 12:51:26 +00:00
/// Moves `buf` just past the next '\r'. If there is no '\r' in the rest of the input, stops at EOF.
void skipToCarriageReturnOrEOF(ReadBuffer & buf)
{
    for (;;)
    {
        if (buf.eof())
            return;

        /// Jump to the first '\r' of the currently available data, or to its end when there is none.
        buf.position() = find_first_symbols<'\r'>(buf.position(), buf.buffer().end());

        /// Nothing left in the available data: go back to the eof() check.
        if (buf.hasPendingData() && *buf.position() == '\r')
        {
            ++buf.position();
            return;
        }
    }
}
2017-01-27 04:29:47 +00:00
/// Moves `buf` just past the next '\n'. If there is no '\n' in the rest of the input, stops at EOF.
void skipToNextLineOrEOF(ReadBuffer & buf)
{
    for (;;)
    {
        if (buf.eof())
            return;

        /// Jump to the first '\n' of the currently available data, or to its end when there is none.
        buf.position() = find_first_symbols<'\n'>(buf.position(), buf.buffer().end());

        /// Nothing left in the available data: go back to the eof() check.
        if (buf.hasPendingData() && *buf.position() == '\n')
        {
            ++buf.position();
            return;
        }
    }
}
/// Moves `buf` just past the next '\n' that is not preceded by a backslash, or to EOF.
/// A backslash together with the single character after it is skipped as a unit.
void skipToUnescapedNextLineOrEOF(ReadBuffer & buf)
{
    for (;;)
    {
        if (buf.eof())
            return;

        /// Jump to the first '\n' or '\\' of the currently available data, or to its end.
        buf.position() = find_first_symbols<'\n', '\\'>(buf.position(), buf.buffer().end());

        if (!buf.hasPendingData())
            continue;

        const char found = *buf.position();
        if (found != '\n' && found != '\\')
            continue;

        /// Step over the symbol that was found.
        ++buf.position();

        if (found == '\n')
            return;

        /// A backslash was the last character of the input.
        if (buf.eof())
            return;

        /// Skip escaped character. We do not consider escape sequences with more than one character after backslash (\x01).
        /// It's ok for the purpose of this function, because we are interested only in \n and \\.
        ++buf.position();
    }
}
2022-11-10 20:15:14 +00:00
/// Moves `buf` just past the next zero byte. If there is none in the rest of the input, stops at EOF.
void skipNullTerminated(ReadBuffer & buf)
{
    for (;;)
    {
        if (buf.eof())
            return;

        /// Jump to the first '\0' of the currently available data, or to its end when there is none.
        buf.position() = find_first_symbols<'\0'>(buf.position(), buf.buffer().end());

        /// Nothing left in the available data: go back to the eof() check.
        if (buf.hasPendingData() && *buf.position() == '\0')
        {
            ++buf.position();
            return;
        }
    }
}
2021-10-16 08:28:10 +00:00
/// Appends the bytes between the current position of `in` and `current` to the end of `memory`,
/// then moves the position of `in` to `current`.
/// `current` is expected to lie between in.position() and the end of the buffer of `in` (asserted).
void saveUpToPosition(ReadBuffer & in, Memory<> & memory, char * current)
{
    assert(current >= in.position());
    assert(current <= in.buffer().end());

    const size_t saved_so_far = memory.size();
    const size_t bytes_to_copy = current - in.position();
    const size_t total_size = saved_so_far + bytes_to_copy;

    /// Nothing was saved before and there is nothing to add now: leave everything untouched.
    if (total_size == 0)
        return;

    assert(in.position() + bytes_to_copy <= in.buffer().end());

    memory.resize(total_size);
    memcpy(memory.data() + saved_so_far, in.position(), bytes_to_copy);
    in.position() = current;
}
2021-10-16 08:28:10 +00:00
/// Makes sure `current` points at readable data.
/// If `current` is before the end of the buffer of `in`, nothing is done and true is returned.
/// Otherwise the data up to `current` is saved into `memory`, `in` is asked whether it has more data
/// (via eof()), and `current` is reset to the position of `in`.
/// Returns false only when `in` reported EOF.
bool loadAtPosition(ReadBuffer & in, Memory<> & memory, char * & current)
{
    assert(current <= in.buffer().end());

    if (current < in.buffer().end())
        return true;

    saveUpToPosition(in, memory, current);

    const bool has_more_data = !in.eof();

    // A sanity check. Buffer position may be in the beginning of the buffer
    // (normal case), or have some offset from it (AIO).
    assert(in.position() >= in.buffer().begin());
    assert(in.position() <= in.buffer().end());

    current = in.position();
    return has_more_data;
}
2021-11-09 13:14:07 +00:00
/// Looks for `delimiter` in the input and leaves the buffer right after it.
/// When the delimiter never occurs, the buffer ends up at EOF. An empty delimiter is a no-op.
static void findAndSkipNextDelimiter(PeekableReadBuffer & buf, const String & delimiter)
{
    if (delimiter.empty())
        return;

    const char first_symbol = delimiter[0];

    while (!buf.eof())
    {
        /// Candidate match: the next occurrence of the first delimiter character in the available data.
        auto * candidate = static_cast<ReadBuffer::Position>(memchr(buf.position(), first_symbol, buf.available()));
        if (candidate == nullptr)
        {
            /// No candidate in the available data: consume all of it and try again.
            buf.position() += buf.available();
            continue;
        }

        buf.position() = candidate;

        /// The checkpoint lets us return to the candidate if the full delimiter does not match.
        PeekableReadBufferCheckpoint checkpoint{buf};
        if (checkString(delimiter, buf))
            return;

        /// False candidate: go back and resume the search one character further.
        buf.rollbackToCheckpoint();
        ++buf.position();
    }
}
/** Skips the input up to the beginning of the next row, or to EOF when no further row exists.
  *
  * If `row_after_delimiter` is empty, rows are separated only by `row_between_delimiter`:
  * the buffer is moved right after its next occurrence (or to EOF).
  * Otherwise the function repeatedly skips past `row_after_delimiter`, optionally skips whitespace,
  * and stops as soon as `row_between_delimiter` follows.
  *
  * @param buf                    input to advance.
  * @param row_after_delimiter    text that terminates a row; may be empty.
  * @param row_between_delimiter  text that separates two rows.
  * @param skip_spaces            whether whitespace between the two delimiters is skipped.
  */
void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delimiter, const String & row_between_delimiter, bool skip_spaces)
{
    if (row_after_delimiter.empty())
    {
        findAndSkipNextDelimiter(buf, row_between_delimiter);
        return;
    }

    while (true)
    {
        findAndSkipNextDelimiter(buf, row_after_delimiter);

        if (skip_spaces)
            skipWhitespaceIfAny(buf);

        /// Stop at EOF explicitly: once the input is exhausted nothing in the loop body can make progress,
        /// so relying on checkString alone would spin forever whenever it fails at EOF.
        /// NOTE(review): checkString is presumed to return false for a non-empty string at EOF - confirm.
        if (buf.eof() || checkString(row_between_delimiter, buf))
            break;
    }
}
2021-12-15 11:30:57 +00:00
// Use PeekableReadBuffer to copy field to string after parsing.
2022-05-13 13:51:28 +00:00
template < typename Vector , typename ParseFunc >
static void readParsedValueInto ( Vector & s , ReadBuffer & buf , ParseFunc parse_func )
2021-12-15 11:30:57 +00:00
{
PeekableReadBuffer peekable_buf ( buf ) ;
peekable_buf . setCheckpoint ( ) ;
parse_func ( peekable_buf ) ;
peekable_buf . makeContinuousMemoryFromCheckpointToPos ( ) ;
auto * end = peekable_buf . position ( ) ;
peekable_buf . rollbackToCheckpoint ( ) ;
s . append ( peekable_buf . position ( ) , end ) ;
peekable_buf . position ( ) = end ;
}
2021-12-03 13:25:35 +00:00
2022-12-07 21:19:27 +00:00
/// Copies a single-quoted string, including both quotes and all backslash escapes as written, into `s`.
/// The input must start with a single quote (asserted via assertChar).
/// If EOF is reached before the closing quote, what was read so far is kept and no closing quote is added.
template <typename Vector>
static void readQuotedStringFieldInto(Vector & s, ReadBuffer & buf)
{
    assertChar('\'', buf);
    s.push_back('\'');

    while (!buf.eof())
    {
        /// Copy everything up to the next backslash or quote verbatim.
        char * stop = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end());
        s.append(buf.position(), stop);
        buf.position() = stop;

        if (!buf.hasPendingData())
            continue;

        const char symbol = *buf.position();

        /// Closing quote: handled after the loop.
        if (symbol == '\'')
            break;

        s.push_back(symbol);

        if (symbol != '\\')
            continue;

        /// Keep the character after the backslash as is, so an escaped quote does not end the string.
        ++buf.position();
        if (!buf.eof())
        {
            s.push_back(*buf.position());
            ++buf.position();
        }
    }

    if (buf.eof())
        return;

    ++buf.position();
    s.push_back('\'');
}
2022-05-13 13:51:28 +00:00
template < char opening_bracket , char closing_bracket , typename Vector >
static void readQuotedFieldInBracketsInto ( Vector & s , ReadBuffer & buf )
2021-12-03 13:25:35 +00:00
{
assertChar ( opening_bracket , buf ) ;
s . push_back ( opening_bracket ) ;
size_t balance = 1 ;
while ( ! buf . eof ( ) & & balance )
{
char * next_pos = find_first_symbols < ' \' ' , opening_bracket , closing_bracket > ( buf . position ( ) , buf . buffer ( ) . end ( ) ) ;
appendToStringOrVector ( s , buf , next_pos ) ;
buf . position ( ) = next_pos ;
if ( ! buf . hasPendingData ( ) )
continue ;
if ( * buf . position ( ) = = ' \' ' )
{
2022-12-07 21:19:27 +00:00
readQuotedStringFieldInto ( s , buf ) ;
2021-12-03 13:25:35 +00:00
}
else if ( * buf . position ( ) = = opening_bracket )
{
2022-12-07 21:19:27 +00:00
s . push_back ( opening_bracket ) ;
2021-12-03 13:25:35 +00:00
+ + balance ;
+ + buf . position ( ) ;
}
else if ( * buf . position ( ) = = closing_bracket )
{
2022-12-07 21:19:27 +00:00
s . push_back ( closing_bracket ) ;
2021-12-03 13:25:35 +00:00
- - balance ;
+ + buf . position ( ) ;
}
}
}
2022-05-13 13:51:28 +00:00
template < typename Vector >
void readQuotedFieldInto ( Vector & s , ReadBuffer & buf )
2021-12-03 13:25:35 +00:00
{
if ( buf . eof ( ) )
return ;
/// Possible values in 'Quoted' field:
/// - Strings: '...'
/// - Arrays: [...]
/// - Tuples: (...)
/// - Maps: {...}
/// - NULL
2022-03-24 12:54:12 +00:00
/// - Bool: true/false
2021-12-03 13:25:35 +00:00
/// - Number: integer, float, decimal.
if ( * buf . position ( ) = = ' \' ' )
2022-12-07 21:19:27 +00:00
readQuotedStringFieldInto ( s , buf ) ;
2021-12-03 13:25:35 +00:00
else if ( * buf . position ( ) = = ' [ ' )
2022-05-13 13:51:28 +00:00
readQuotedFieldInBracketsInto < ' [ ' , ' ] ' > ( s , buf ) ;
2021-12-03 13:25:35 +00:00
else if ( * buf . position ( ) = = ' ( ' )
2022-05-13 13:51:28 +00:00
readQuotedFieldInBracketsInto < ' ( ' , ' ) ' > ( s , buf ) ;
2021-12-03 13:25:35 +00:00
else if ( * buf . position ( ) = = ' { ' )
2022-05-13 13:51:28 +00:00
readQuotedFieldInBracketsInto < ' { ' , ' } ' > ( s , buf ) ;
2021-12-03 13:25:35 +00:00
else if ( checkCharCaseInsensitive ( ' n ' , buf ) )
{
/// NULL or NaN
if ( checkCharCaseInsensitive ( ' u ' , buf ) )
{
assertStringCaseInsensitive ( " ll " , buf ) ;
s . append ( " NULL " ) ;
}
else
{
assertStringCaseInsensitive ( " an " , buf ) ;
s . append ( " NaN " ) ;
}
}
2022-03-24 12:54:12 +00:00
else if ( checkCharCaseInsensitive ( ' t ' , buf ) )
{
assertStringCaseInsensitive ( " rue " , buf ) ;
s . append ( " true " ) ;
}
else if ( checkCharCaseInsensitive ( ' f ' , buf ) )
{
assertStringCaseInsensitive ( " alse " , buf ) ;
s . append ( " false " ) ;
}
2021-12-03 13:25:35 +00:00
else
{
/// It's an integer, float or decimal. They all can be parsed as float.
2021-12-15 11:30:57 +00:00
auto parse_func = [ ] ( ReadBuffer & in )
{
Float64 tmp ;
readFloatText ( tmp , in ) ;
} ;
2022-05-13 13:51:28 +00:00
readParsedValueInto ( s , buf , parse_func ) ;
2021-12-03 13:25:35 +00:00
}
}
2022-05-13 15:08:02 +00:00
/// Explicit instantiation of readQuotedFieldInto for the NullOutput sink type.
template void readQuotedFieldInto < NullOutput > ( NullOutput & s , ReadBuffer & buf ) ;
2022-05-13 13:51:28 +00:00
/// Reads one 'Quoted' field from `buf` into `s`.
/// Any previous content of `s` is discarded first; the parsing itself is done by readQuotedFieldInto.
void readQuotedField ( String & s , ReadBuffer & buf )
{
s . clear ( ) ;
readQuotedFieldInto ( s , buf ) ;
}
2022-05-06 16:48:48 +00:00
/// Reads one JSON value from `buf` and stores its raw text in `s` (previous content of `s` is discarded).
/// The value boundaries are found by skipJSONField; "json_field" is the key name it uses in error messages.
void readJSONField(String & s, ReadBuffer & buf)
{
    s.clear();
    readParsedValueInto(s, buf, [](ReadBuffer & in) { skipJSONField(in, "json_field"); });
}
2021-12-03 13:25:35 +00:00
2010-06-04 18:25:25 +00:00
}