Improve skipping unknown fields with Quoted escaping rule in Template/CustomSeparated formats

This commit is contained in:
avogar 2021-12-03 16:25:35 +03:00
parent 049b2c0c14
commit 7549619b25
6 changed files with 184 additions and 5 deletions

View File

@ -69,10 +69,7 @@ void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule esca
readEscapedString(tmp, buf);
break;
case FormatSettings::EscapingRule::Quoted:
/// FIXME: it skips only strings, not numbers, arrays or tuples.
/// we should read until delimiter and skip all data between
/// single quotes.
readQuotedString(tmp, buf);
readQuotedFieldIntoString(tmp, buf);
break;
case FormatSettings::EscapingRule::CSV:
readCSVString(tmp, buf, format_settings.csv);

View File

@ -1212,4 +1212,96 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim
}
}
/// Copies a bracketed value from buf into s, starting at opening_bracket and stopping
/// right after the closing_bracket that balances it.
/// Nested brackets of the same kind are counted. Single-quoted strings met on the way are
/// consumed as a whole by readQuotedStringInto, so the scan resumes after the string.
/// If buf reaches EOF before the brackets are balanced, the function returns with what was read so far.
template <char opening_bracket, char closing_bracket>
static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf)
{
    assertChar(opening_bracket, buf);
    s.push_back(opening_bracket);

    /// Number of brackets that are opened and not closed yet.
    size_t depth = 1;

    while (!buf.eof() && depth != 0)
    {
        /// Copy in bulk everything up to the next character that needs special handling
        /// (or up to the end of the currently available data).
        char * special_pos = find_first_symbols<'\'', opening_bracket, closing_bracket>(buf.position(), buf.buffer().end());
        appendToStringOrVector(s, buf, special_pos);
        buf.position() = special_pos;

        /// When the available data is exhausted there is nothing to inspect:
        /// fall through to the loop condition again.
        if (buf.hasPendingData())
        {
            const char symbol = *buf.position();
            s.push_back(symbol);

            if (symbol == '\'')
            {
                /// The opening quote was appended above; append the closing one
                /// after the string contents have been read.
                readQuotedStringInto<false>(s, buf);
                s.push_back('\'');
            }
            else if (symbol == opening_bracket)
            {
                ++depth;
                ++buf.position();
            }
            else if (symbol == closing_bracket)
            {
                --depth;
                ++buf.position();
            }
        }
    }
}
/// Reads a single value written in 'Quoted' representation from buf into s.
/// s is cleared first and stays empty when buf is already at EOF.
/// The kind of value is chosen by its first character:
///  - '\''            : string, read with readQuotedString;
///  - '[', '(', '{'   : array, tuple or map, copied up to the matching closing bracket;
///  - 'n' / 'N'       : NULL or NaN (case insensitive), stored as "NULL" or "NaN";
///  - anything else   : a number, parsed as Float64 and copied as it was written.
void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
{
    s.clear();

    if (buf.eof())
        return;

    switch (*buf.position())
    {
        case '\'':
            readQuotedString(s, buf);
            return;
        case '[':
            readQuotedFieldInBrackets<'[', ']'>(s, buf);
            return;
        case '(':
            readQuotedFieldInBrackets<'(', ')'>(s, buf);
            return;
        case '{':
            readQuotedFieldInBrackets<'{', '}'>(s, buf);
            return;
        default:
            break;
    }

    if (checkCharCaseInsensitive('n', buf))
    {
        /// The leading 'n' is already consumed: the rest is either "ull" or "an".
        if (checkCharCaseInsensitive('u', buf))
        {
            assertStringCaseInsensitive("ll", buf);
            s.append("NULL");
        }
        else
        {
            assertStringCaseInsensitive("an", buf);
            s.append("NaN");
        }
        return;
    }

    /// Integer, float or decimal: all of them can be parsed as a float.
    /// The value itself is not needed. The checkpoint marks where the field starts,
    /// so that after parsing the characters between the checkpoint and the final position
    /// can be copied into s.
    PeekableReadBuffer peekable(buf);
    peekable.setCheckpoint();

    Float64 parsed_value;
    readFloatText(parsed_value, peekable);

    peekable.makeContinuousMemoryFromCheckpointToPos();
    auto * field_end = peekable.position();
    peekable.rollbackToCheckpoint();
    s.append(peekable.position(), field_end);

    /// Move past the field again after the rollback.
    peekable.position() = field_end;
}
}

View File

@ -184,6 +184,15 @@ inline void assertChar(char symbol, ReadBuffer & buf)
}
}
/// Peeks at the next character of buf and compares it with c ignoring case.
/// On a match the character is consumed and true is returned; otherwise
/// (including when there is nothing to peek) buf is left as is and false is returned.
inline bool checkCharCaseInsensitive(char c, ReadBuffer & buf)
{
    char next_char;
    const bool matched = buf.peek(next_char) && equalsCaseInsensitive(next_char, c);
    if (matched)
        buf.ignore();
    return matched;
}
inline void assertString(const String & s, ReadBuffer & buf)
{
assertString(s.c_str(), buf);
@ -1375,4 +1384,6 @@ struct PcgDeserializer
}
};
/// Reads one value in 'Quoted' representation (quoted string, array, tuple, map, NULL/NaN or a number)
/// from buf into s. s is cleared first and is left empty if buf is at EOF.
void readQuotedFieldIntoString(String & s, ReadBuffer & buf);
}

View File

@ -205,7 +205,7 @@ void CustomSeparatedRowInputFormat::syncAfterError()
/// Delegates to parseDelimiterWithDiagnosticInfo for format_settings.custom.row_before_delimiter,
/// describing it as "delimiter before first field" in the diagnostic text, and returns its result.
/// `out` is forwarded as the buffer for the diagnostic output.
bool CustomSeparatedRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out)
{
    return parseDelimiterWithDiagnosticInfo(out, buf, format_settings.custom.row_before_delimiter, "delimiter before first field", ignore_spaces);
}
bool CustomSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)

View File

@ -0,0 +1,26 @@
1 42
2 42
3 42
4 42
5 42
6 42
7 42
8 42
9 42
10 42
11 42
12 42
13 42
14 42
15 42
16 42
17 42
18 42
19 42
20 42
21 42
22 42
23 42
24 42
25 42
26 42

View File

@ -0,0 +1,53 @@
#!/usr/bin/env bash
# Tags: no-parallel

# Inserts rows in CustomSeparatedWithNames format with the 'Quoted' escaping rule.
# Every row has an extra 'trash' column that does not exist in the table, so with
# input_format_skip_unknown_fields=1 its value (string, number, NULL, array, tuple or map)
# has to be skipped. The final select prints the rows that were inserted.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

$CLICKHOUSE_CLIENT -q "drop table if exists test_02129"
$CLICKHOUSE_CLIENT -q "create table test_02129 (x UInt64, y UInt64) engine=Memory()"

QUERY="insert into test_02129 format CustomSeparatedWithNames settings input_format_skip_unknown_fields=1, format_custom_escaping_rule='Quoted'"

# Row with column names; the backslash escapes are expanded later by 'echo -e'.
HEADER_ROW="'x'\t'trash'\t'y'"

# Sends the header row followed by one data row ($1, with 'echo -e' escapes) to the insert query.
insert_row()
{
    echo -e "${HEADER_ROW}\n$1" | $CLICKHOUSE_CLIENT -q "$QUERY"
}

# Skip string
insert_row "1\t'Some string'\t42"

# Skip number
insert_row "2\t42\t42"
insert_row "3\t4242.4242\t42"
insert_row "4\t-42\t42"
insert_row "5\t+42\t42"
insert_row "6\t-4242.424242\t42"
insert_row "7\t+4242.424242\t42"
insert_row "8\tnan\t42"
insert_row "9\tinf\t42"
insert_row "10\t+nan\t42"
insert_row "11\t+inf\t42"
insert_row "12\t-nan\t42"
insert_row "13\t-inf\t42"
insert_row "14\t44444444444444444444444444.444444444444444444444444\t42"
insert_row "15\t30e30\t42"
insert_row "16\t-30e-30\t42"

# Skip NULL
insert_row "17\tNULL\t42"

# Skip an array
insert_row "18\t[1,2,3,4]\t42"
insert_row "19\t['some string ]][[][][]', 'one more string (){}][[{[[[[[[']\t42"
insert_row "20\t[[(1,2), (3,4)], [(5,6), (7,8)]]\t42"

# Skip a tuple
insert_row "21\t(1,2,3,4)\t42"
insert_row "22\t('some string ()))))(()(())', 'one more string (){}][[{[)))))')\t42"
insert_row "23\t(([1,2], (3,4)), ([5,6], (7,8)))\t42"

# Skip a map
insert_row "24\t{1:2,2:3,3:4,4:5}\t42"
insert_row "25\t{'some string }}}}}}{{{{':123, 'one more string (){}][[{[{{{{{':123}\t42"
insert_row "26\t{'key':{1:(1,2), 2:(3,4)}, 'foo':{1:(5,6), 2:(7,8)}}\t42"

$CLICKHOUSE_CLIENT -q "select * from test_02129 order by x"
$CLICKHOUSE_CLIENT -q "drop table test_02129"