mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 15:42:02 +00:00
Improve skipping unknown fields with Quoted escaping rule in Template/CustomSeparated formats
This commit is contained in:
parent
049b2c0c14
commit
7549619b25
@ -69,10 +69,7 @@ void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule esca
|
||||
readEscapedString(tmp, buf);
|
||||
break;
|
||||
case FormatSettings::EscapingRule::Quoted:
|
||||
/// FIXME: it skips only strings, not numbers, arrays or tuples.
|
||||
/// we should read until delimiter and skip all data between
|
||||
/// single quotes.
|
||||
readQuotedString(tmp, buf);
|
||||
readQuotedFieldIntoString(tmp, buf);
|
||||
break;
|
||||
case FormatSettings::EscapingRule::CSV:
|
||||
readCSVString(tmp, buf, format_settings.csv);
|
||||
|
@ -1212,4 +1212,96 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <char opening_bracket, char closing_bracket>
|
||||
static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf)
|
||||
{
|
||||
assertChar(opening_bracket, buf);
|
||||
s.push_back(opening_bracket);
|
||||
|
||||
size_t balance = 1;
|
||||
|
||||
while (!buf.eof() && balance)
|
||||
{
|
||||
char * next_pos = find_first_symbols<'\'', opening_bracket, closing_bracket>(buf.position(), buf.buffer().end());
|
||||
appendToStringOrVector(s, buf, next_pos);
|
||||
buf.position() = next_pos;
|
||||
|
||||
if (!buf.hasPendingData())
|
||||
continue;
|
||||
|
||||
s.push_back(*buf.position());
|
||||
|
||||
if (*buf.position() == '\'')
|
||||
{
|
||||
readQuotedStringInto<false>(s, buf);
|
||||
s.push_back('\'');
|
||||
}
|
||||
else if (*buf.position() == opening_bracket)
|
||||
{
|
||||
++balance;
|
||||
++buf.position();
|
||||
}
|
||||
else if (*buf.position() == closing_bracket)
|
||||
{
|
||||
--balance;
|
||||
++buf.position();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads one value written with the 'Quoted' escaping rule from `buf` and stores its text in `s`.
/// `s` is cleared first and stays empty when `buf` is already at EOF.
///
/// The kind of value is chosen by its first character:
///   '        - string, read with readQuotedString
///   [ ( {    - array / tuple / map, copied together with the enclosing brackets
///   n or N   - NULL or NaN (matched case-insensitively), stored as "NULL" / "NaN"
///   other    - number (integer, float, decimal); all of them can be parsed as a float
void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
{
    s.clear();

    if (buf.eof())
        return;

    switch (*buf.position())
    {
        case '\'':
            readQuotedString(s, buf);
            return;
        case '[':
            readQuotedFieldInBrackets<'[', ']'>(s, buf);
            return;
        case '(':
            readQuotedFieldInBrackets<'(', ')'>(s, buf);
            return;
        case '{':
            readQuotedFieldInBrackets<'{', '}'>(s, buf);
            return;
        default:
            break;
    }

    if (checkCharCaseInsensitive('n', buf))
    {
        /// The leading 'n' is consumed; the next character tells NULL from NaN.
        const bool is_null = checkCharCaseInsensitive('u', buf);
        assertStringCaseInsensitive(is_null ? "ll" : "an", buf);
        s.append(is_null ? "NULL" : "NaN");
        return;
    }

    /// Numeric value. Parse it as Float64 through a PeekableReadBuffer only to find where it
    /// ends, then copy the bytes between the checkpoint and that end into `s`.
    PeekableReadBuffer peekable_buf(buf);
    peekable_buf.setCheckpoint();

    Float64 parsed_value;
    readFloatText(parsed_value, peekable_buf);

    peekable_buf.makeContinuousMemoryFromCheckpointToPos();
    auto * const field_end = peekable_buf.position();

    peekable_buf.rollbackToCheckpoint();
    s.append(peekable_buf.position(), field_end);

    /// Move past the number again so that it counts as read.
    peekable_buf.position() = field_end;
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -184,6 +184,15 @@ inline void assertChar(char symbol, ReadBuffer & buf)
|
||||
}
|
||||
}
|
||||
|
||||
inline bool checkCharCaseInsensitive(char c, ReadBuffer & buf)
|
||||
{
|
||||
char a;
|
||||
if (!buf.peek(a) || !equalsCaseInsensitive(a, c))
|
||||
return false;
|
||||
buf.ignore();
|
||||
return true;
|
||||
}
|
||||
|
||||
inline void assertString(const String & s, ReadBuffer & buf)
|
||||
{
|
||||
assertString(s.c_str(), buf);
|
||||
@ -1375,4 +1384,6 @@ struct PcgDeserializer
|
||||
}
|
||||
};
|
||||
|
||||
/// Reads one field in 'Quoted' escaping from `buf` into `s` (`s` is cleared first).
/// Accepts a quoted string '...', an array [...], a tuple (...), a map {...},
/// NULL / NaN (case-insensitive) or a number (integer, float, decimal).
void readQuotedFieldIntoString(String & s, ReadBuffer & buf);
|
||||
|
||||
}
|
||||
|
@ -205,7 +205,7 @@ void CustomSeparatedRowInputFormat::syncAfterError()
|
||||
|
||||
/// Parses the delimiter expected before the first field of a row
/// (format_settings.custom.row_before_delimiter) by delegating to
/// parseDelimiterWithDiagnosticInfo, passing `out` and the description
/// "delimiter before first field" along, and returns that call's result.
/// NOTE(review): presumably `out` receives the human-readable diagnostics - confirm against
/// parseDelimiterWithDiagnosticInfo.
bool CustomSeparatedRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out)
{
    return parseDelimiterWithDiagnosticInfo(out, buf, format_settings.custom.row_before_delimiter, "delimiter before first field", ignore_spaces);
}
|
||||
|
||||
bool CustomSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
|
||||
|
26
tests/queries/0_stateless/02129_skip_quoted_fields.reference
Normal file
26
tests/queries/0_stateless/02129_skip_quoted_fields.reference
Normal file
@ -0,0 +1,26 @@
|
||||
1 42
|
||||
2 42
|
||||
3 42
|
||||
4 42
|
||||
5 42
|
||||
6 42
|
||||
7 42
|
||||
8 42
|
||||
9 42
|
||||
10 42
|
||||
11 42
|
||||
12 42
|
||||
13 42
|
||||
14 42
|
||||
15 42
|
||||
16 42
|
||||
17 42
|
||||
18 42
|
||||
19 42
|
||||
20 42
|
||||
21 42
|
||||
22 42
|
||||
23 42
|
||||
24 42
|
||||
25 42
|
||||
26 42
|
53
tests/queries/0_stateless/02129_skip_quoted_fields.sh
Executable file
53
tests/queries/0_stateless/02129_skip_quoted_fields.sh
Executable file
@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env bash
# Tags: no-parallel

# Checks that CustomSeparatedWithNames input with the 'Quoted' escaping rule can skip an
# unknown column ('trash') whatever kind of value it holds: string, number, NULL, array,
# tuple or map. Each case inserts one row "<n>, 42" into test_02129.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

$CLICKHOUSE_CLIENT -q "drop table if exists test_02129"
$CLICKHOUSE_CLIENT -q "create table test_02129 (x UInt64, y UInt64) engine=Memory()"

QUERY="insert into test_02129 format CustomSeparatedWithNames settings input_format_skip_unknown_fields=1, format_custom_escaping_rule='Quoted'"

# Pipes the common header row plus the data row given in $1 into the insert query.
# $1 is expanded by `echo -e`, so it may contain escapes such as \t.
function insert_row()
{
    echo -e "'x'\t'trash'\t'y'\n$1" | $CLICKHOUSE_CLIENT -q "$QUERY"
}

# Skip string
insert_row "1\t'Some string'\t42"

# Skip number
insert_row "2\t42\t42"
insert_row "3\t4242.4242\t42"
insert_row "4\t-42\t42"
insert_row "5\t+42\t42"
insert_row "6\t-4242.424242\t42"
insert_row "7\t+4242.424242\t42"
insert_row "8\tnan\t42"
insert_row "9\tinf\t42"
insert_row "10\t+nan\t42"
insert_row "11\t+inf\t42"
insert_row "12\t-nan\t42"
insert_row "13\t-inf\t42"
insert_row "14\t44444444444444444444444444.444444444444444444444444\t42"
insert_row "15\t30e30\t42"
insert_row "16\t-30e-30\t42"

# Skip NULL
insert_row "17\tNULL\t42"

# Skip an array
insert_row "18\t[1,2,3,4]\t42"
insert_row "19\t['some string ]][[][][]', 'one more string (){}][[{[[[[[[']\t42"
insert_row "20\t[[(1,2), (3,4)], [(5,6), (7,8)]]\t42"

# Skip a tuple
insert_row "21\t(1,2,3,4)\t42"
insert_row "22\t('some string ()))))(()(())', 'one more string (){}][[{[)))))')\t42"
insert_row "23\t(([1,2], (3,4)), ([5,6], (7,8)))\t42"

# Skip a map
insert_row "24\t{1:2,2:3,3:4,4:5}\t42"
insert_row "25\t{'some string }}}}}}{{{{':123, 'one more string (){}][[{[{{{{{':123}\t42"
insert_row "26\t{'key':{1:(1,2), 2:(3,4)}, 'foo':{1:(5,6), 2:(7,8)}}\t42"

$CLICKHOUSE_CLIENT -q "select * from test_02129 order by x"
$CLICKHOUSE_CLIENT -q "drop table test_02129"
|
||||
|
Loading…
Reference in New Issue
Block a user