mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
Merge pull request #32204 from Avogar/skip-quoted-values
Improve skipping unknown fields with Quoted escaping rule in Template/CustomSeparated formats
This commit is contained in:
commit
cc71c537bc
@ -69,10 +69,7 @@ void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule esca
|
|||||||
readEscapedString(tmp, buf);
|
readEscapedString(tmp, buf);
|
||||||
break;
|
break;
|
||||||
case FormatSettings::EscapingRule::Quoted:
|
case FormatSettings::EscapingRule::Quoted:
|
||||||
/// FIXME: it skips only strings, not numbers, arrays or tuples.
|
readQuotedFieldIntoString(tmp, buf);
|
||||||
/// we should read until delimiter and skip all data between
|
|
||||||
/// single quotes.
|
|
||||||
readQuotedString(tmp, buf);
|
|
||||||
break;
|
break;
|
||||||
case FormatSettings::EscapingRule::CSV:
|
case FormatSettings::EscapingRule::CSV:
|
||||||
readCSVString(tmp, buf, format_settings.csv);
|
readCSVString(tmp, buf, format_settings.csv);
|
||||||
|
@ -1212,4 +1212,96 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <char opening_bracket, char closing_bracket>
|
||||||
|
static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf)
|
||||||
|
{
|
||||||
|
assertChar(opening_bracket, buf);
|
||||||
|
s.push_back(opening_bracket);
|
||||||
|
|
||||||
|
size_t balance = 1;
|
||||||
|
|
||||||
|
while (!buf.eof() && balance)
|
||||||
|
{
|
||||||
|
char * next_pos = find_first_symbols<'\'', opening_bracket, closing_bracket>(buf.position(), buf.buffer().end());
|
||||||
|
appendToStringOrVector(s, buf, next_pos);
|
||||||
|
buf.position() = next_pos;
|
||||||
|
|
||||||
|
if (!buf.hasPendingData())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
s.push_back(*buf.position());
|
||||||
|
|
||||||
|
if (*buf.position() == '\'')
|
||||||
|
{
|
||||||
|
readQuotedStringInto<false>(s, buf);
|
||||||
|
s.push_back('\'');
|
||||||
|
}
|
||||||
|
else if (*buf.position() == opening_bracket)
|
||||||
|
{
|
||||||
|
++balance;
|
||||||
|
++buf.position();
|
||||||
|
}
|
||||||
|
else if (*buf.position() == closing_bracket)
|
||||||
|
{
|
||||||
|
--balance;
|
||||||
|
++buf.position();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reads a single value written with the 'Quoted' escaping rule from buf into s.
/// s is cleared first; if buf is already at eof, s stays empty.
///
/// The kind of value is chosen by its first character:
///  - '\''          : string, read with readQuotedString;
///  - '[', '(', '{' : array, tuple or map, read with readQuotedFieldInBrackets;
///  - 'n' / 'N'     : NULL or NaN (case insensitive), stored as "NULL" or "NaN";
///  - anything else : a number (integer, float, decimal), copied as it appears in the input.
void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
{
    s.clear();

    if (buf.eof())
        return;

    switch (*buf.position())
    {
        case '\'':
            readQuotedString(s, buf);
            return;
        case '[':
            readQuotedFieldInBrackets<'[', ']'>(s, buf);
            return;
        case '(':
            readQuotedFieldInBrackets<'(', ')'>(s, buf);
            return;
        case '{':
            readQuotedFieldInBrackets<'{', '}'>(s, buf);
            return;
        default:
            break;
    }

    if (checkCharCaseInsensitive('n', buf))
    {
        /// After the leading 'n' a 'u' means NULL; otherwise the value has to be NaN.
        const bool is_null = checkCharCaseInsensitive('u', buf);
        assertStringCaseInsensitive(is_null ? "ll" : "an", buf);
        s.append(is_null ? "NULL" : "NaN");
        return;
    }

    /// Integers, floats and decimals can all be parsed as a float. The parsed value is
    /// discarded: parsing is only used to find where the number ends, and the characters
    /// between the checkpoint and that position are then copied into s.
    PeekableReadBuffer peekable(buf);
    peekable.setCheckpoint();

    Float64 parsed_value;
    readFloatText(parsed_value, peekable);

    peekable.makeContinuousMemoryFromCheckpointToPos();
    auto * field_end = peekable.position();
    peekable.rollbackToCheckpoint();

    s.append(peekable.position(), field_end);
    peekable.position() = field_end;
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -184,6 +184,15 @@ inline void assertChar(char symbol, ReadBuffer & buf)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline bool checkCharCaseInsensitive(char c, ReadBuffer & buf)
|
||||||
|
{
|
||||||
|
char a;
|
||||||
|
if (!buf.peek(a) || !equalsCaseInsensitive(a, c))
|
||||||
|
return false;
|
||||||
|
buf.ignore();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
inline void assertString(const String & s, ReadBuffer & buf)
|
inline void assertString(const String & s, ReadBuffer & buf)
|
||||||
{
|
{
|
||||||
assertString(s.c_str(), buf);
|
assertString(s.c_str(), buf);
|
||||||
@ -1375,4 +1384,6 @@ struct PcgDeserializer
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Reads one value written with the 'Quoted' escaping rule from buf into s (s is cleared first).
/// Supported values: strings '...', arrays [...], tuples (...), maps {...}, NULL, NaN and
/// numbers (integer, float, decimal).
void readQuotedFieldIntoString(String & s, ReadBuffer & buf);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -205,7 +205,7 @@ void CustomSeparatedRowInputFormat::syncAfterError()
|
|||||||
|
|
||||||
bool CustomSeparatedRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out)
|
bool CustomSeparatedRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out)
|
||||||
{
|
{
|
||||||
return parseDelimiterWithDiagnosticInfo(out, buf, format_settings.custom.row_before_delimiter, "delimiter before first firld", ignore_spaces);
|
return parseDelimiterWithDiagnosticInfo(out, buf, format_settings.custom.row_before_delimiter, "delimiter before first field", ignore_spaces);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CustomSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
|
bool CustomSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
|
||||||
|
26
tests/queries/0_stateless/02129_skip_quoted_fields.reference
Normal file
26
tests/queries/0_stateless/02129_skip_quoted_fields.reference
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
1 42
|
||||||
|
2 42
|
||||||
|
3 42
|
||||||
|
4 42
|
||||||
|
5 42
|
||||||
|
6 42
|
||||||
|
7 42
|
||||||
|
8 42
|
||||||
|
9 42
|
||||||
|
10 42
|
||||||
|
11 42
|
||||||
|
12 42
|
||||||
|
13 42
|
||||||
|
14 42
|
||||||
|
15 42
|
||||||
|
16 42
|
||||||
|
17 42
|
||||||
|
18 42
|
||||||
|
19 42
|
||||||
|
20 42
|
||||||
|
21 42
|
||||||
|
22 42
|
||||||
|
23 42
|
||||||
|
24 42
|
||||||
|
25 42
|
||||||
|
26 42
|
53
tests/queries/0_stateless/02129_skip_quoted_fields.sh
Executable file
53
tests/queries/0_stateless/02129_skip_quoted_fields.sh
Executable file
@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env bash
# Tags: no-parallel

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Checks that an unknown column ('trash') is skipped for every kind of value the
# 'Quoted' escaping rule can contain, when input_format_skip_unknown_fields=1.
# Each case inserts one row (x = case number, y = 42); the final select lists them.

$CLICKHOUSE_CLIENT -q "drop table if exists test_02129"
$CLICKHOUSE_CLIENT -q "create table test_02129 (x UInt64, y UInt64) engine=Memory()"

QUERY="insert into test_02129 format CustomSeparatedWithNames settings input_format_skip_unknown_fields=1, format_custom_escaping_rule='Quoted'"

# Header row shared by all cases; the escapes are expanded later by `echo -e`.
HEADER="'x'\t'trash'\t'y'"

# Sends the header followed by one data row ($1) to the insert query.
function insert_row()
{
    echo -e "${HEADER}\n$1" | $CLICKHOUSE_CLIENT -q "$QUERY"
}

# Skip string
insert_row "1\t'Some string'\t42"

# Skip number
insert_row "2\t42\t42"
insert_row "3\t4242.4242\t42"
insert_row "4\t-42\t42"
insert_row "5\t+42\t42"
insert_row "6\t-4242.424242\t42"
insert_row "7\t+4242.424242\t42"
insert_row "8\tnan\t42"
insert_row "9\tinf\t42"
insert_row "10\t+nan\t42"
insert_row "11\t+inf\t42"
insert_row "12\t-nan\t42"
insert_row "13\t-inf\t42"
insert_row "14\t44444444444444444444444444.444444444444444444444444\t42"
insert_row "15\t30e30\t42"
insert_row "16\t-30e-30\t42"

# Skip NULL
insert_row "17\tNULL\t42"

# Skip an array
insert_row "18\t[1,2,3,4]\t42"
insert_row "19\t['some string ]][[][][]', 'one more string (){}][[{[[[[[[']\t42"
insert_row "20\t[[(1,2), (3,4)], [(5,6), (7,8)]]\t42"

# Skip a tuple
insert_row "21\t(1,2,3,4)\t42"
insert_row "22\t('some string ()))))(()(())', 'one more string (){}][[{[)))))')\t42"
insert_row "23\t(([1,2], (3,4)), ([5,6], (7,8)))\t42"

# Skip a map
insert_row "24\t{1:2,2:3,3:4,4:5}\t42"
insert_row "25\t{'some string }}}}}}{{{{':123, 'one more string (){}][[{[{{{{{':123}\t42"
insert_row "26\t{'key':{1:(1,2), 2:(3,4)}, 'foo':{1:(5,6), 2:(7,8)}}\t42"

$CLICKHOUSE_CLIENT -q "select * from test_02129 order by x"
$CLICKHOUSE_CLIENT -q "drop table test_02129"
|
||||||
|
|
Loading…
Reference in New Issue
Block a user