From 7549619b25c8a711cc2e3522c0e6631e0307528f Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 3 Dec 2021 16:25:35 +0300 Subject: [PATCH] Improve skipping unknown fields with Quoted escaping rule in Template/CustomSeparated formats --- src/Formats/EscapingRuleUtils.cpp | 5 +- src/IO/ReadHelpers.cpp | 92 +++++++++++++++++++ src/IO/ReadHelpers.h | 11 +++ .../Impl/CustomSeparatedRowInputFormat.cpp | 2 +- .../02129_skip_quoted_fields.reference | 26 ++++++ .../0_stateless/02129_skip_quoted_fields.sh | 53 +++++++++++ 6 files changed, 184 insertions(+), 5 deletions(-) create mode 100644 tests/queries/0_stateless/02129_skip_quoted_fields.reference create mode 100755 tests/queries/0_stateless/02129_skip_quoted_fields.sh diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 2c2662a6a67..d956d9e6bfb 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -69,10 +69,7 @@ void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule esca readEscapedString(tmp, buf); break; case FormatSettings::EscapingRule::Quoted: - /// FIXME: it skips only strings, not numbers, arrays or tuples. - /// we should read until delimiter and skip all data between - /// single quotes. 
- readQuotedString(tmp, buf); + readQuotedFieldIntoString(tmp, buf); break; case FormatSettings::EscapingRule::CSV: readCSVString(tmp, buf, format_settings.csv); diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 675adc43ce6..b0a6838b81e 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -1212,4 +1212,96 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim } } + +template <char opening_bracket, char closing_bracket> +static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf) +{ + assertChar(opening_bracket, buf); + s.push_back(opening_bracket); + + size_t balance = 1; + + while (!buf.eof() && balance) + { + char * next_pos = find_first_symbols<'\'', opening_bracket, closing_bracket>(buf.position(), buf.buffer().end()); + appendToStringOrVector(s, buf, next_pos); + buf.position() = next_pos; + + if (!buf.hasPendingData()) + continue; + + s.push_back(*buf.position()); + + if (*buf.position() == '\'') + { + readQuotedStringInto<false>(s, buf); + s.push_back('\''); + } + else if (*buf.position() == opening_bracket) + { + ++balance; + ++buf.position(); + } + else if (*buf.position() == closing_bracket) + { + --balance; + ++buf.position(); + } + } +} + +void readQuotedFieldIntoString(String & s, ReadBuffer & buf) +{ + s.clear(); + + if (buf.eof()) + return; + + /// Possible values in 'Quoted' field: + /// - Strings: '...' + /// - Arrays: [...] + /// - Tuples: (...) + /// - Maps: {...} + /// - NULL + /// - Number: integer, float, decimal. 
+ + if (*buf.position() == '\'') + readQuotedString(s, buf); + else if (*buf.position() == '[') + readQuotedFieldInBrackets<'[', ']'>(s, buf); + else if (*buf.position() == '(') + readQuotedFieldInBrackets<'(', ')'>(s, buf); + else if (*buf.position() == '{') + readQuotedFieldInBrackets<'{', '}'>(s, buf); + else if (checkCharCaseInsensitive('n', buf)) + { + /// NULL or NaN + if (checkCharCaseInsensitive('u', buf)) + { + assertStringCaseInsensitive("ll", buf); + s.append("NULL"); + } + else + { + assertStringCaseInsensitive("an", buf); + s.append("NaN"); + } + } + else + { + /// It's an integer, float or decimal. They all can be parsed as float. + /// Use PeekableReadBuffer to copy field to string after parsing. + PeekableReadBuffer peekable_buf(buf); + peekable_buf.setCheckpoint(); + Float64 tmp; + readFloatText(tmp, peekable_buf); + peekable_buf.makeContinuousMemoryFromCheckpointToPos(); + auto * end = peekable_buf.position(); + peekable_buf.rollbackToCheckpoint(); + s.append(peekable_buf.position(), end); + peekable_buf.position() = end; + } +} + + } diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index da59fc7973c..c48306cf6d3 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -184,6 +184,15 @@ inline void assertChar(char symbol, ReadBuffer & buf) } } +inline bool checkCharCaseInsensitive(char c, ReadBuffer & buf) +{ + char a; + if (!buf.peek(a) || !equalsCaseInsensitive(a, c)) + return false; + buf.ignore(); + return true; +} + inline void assertString(const String & s, ReadBuffer & buf) { assertString(s.c_str(), buf); @@ -1375,4 +1384,6 @@ struct PcgDeserializer } }; +void readQuotedFieldIntoString(String & s, ReadBuffer & buf); + } diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index 6ff9a8cca2c..8cd9d154ae4 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ 
b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -205,7 +205,7 @@ void CustomSeparatedRowInputFormat::syncAfterError() bool CustomSeparatedRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out) { - return parseDelimiterWithDiagnosticInfo(out, buf, format_settings.custom.row_before_delimiter, "delimiter before first firld", ignore_spaces); + return parseDelimiterWithDiagnosticInfo(out, buf, format_settings.custom.row_before_delimiter, "delimiter before first field", ignore_spaces); } bool CustomSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) diff --git a/tests/queries/0_stateless/02129_skip_quoted_fields.reference b/tests/queries/0_stateless/02129_skip_quoted_fields.reference new file mode 100644 index 00000000000..312f526ca28 --- /dev/null +++ b/tests/queries/0_stateless/02129_skip_quoted_fields.reference @@ -0,0 +1,26 @@ +1 42 +2 42 +3 42 +4 42 +5 42 +6 42 +7 42 +8 42 +9 42 +10 42 +11 42 +12 42 +13 42 +14 42 +15 42 +16 42 +17 42 +18 42 +19 42 +20 42 +21 42 +22 42 +23 42 +24 42 +25 42 +26 42 diff --git a/tests/queries/0_stateless/02129_skip_quoted_fields.sh b/tests/queries/0_stateless/02129_skip_quoted_fields.sh new file mode 100755 index 00000000000..c1baeb5b8f2 --- /dev/null +++ b/tests/queries/0_stateless/02129_skip_quoted_fields.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "drop table if exists test_02129" +$CLICKHOUSE_CLIENT -q "create table test_02129 (x UInt64, y UInt64) engine=Memory()" + +QUERY="insert into test_02129 format CustomSeparatedWithNames settings input_format_skip_unknown_fields=1, format_custom_escaping_rule='Quoted'" + +# Skip string +echo -e "'x'\t'trash'\t'y'\n1\t'Some string'\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" + +# Skip number +echo -e "'x'\t'trash'\t'y'\n2\t42\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n3\t4242.4242\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n4\t-42\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n5\t+42\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n6\t-4242.424242\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n7\t+4242.424242\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n8\tnan\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n9\tinf\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n10\t+nan\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n11\t+inf\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n12\t-nan\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n13\t-inf\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n14\t44444444444444444444444444.444444444444444444444444\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n15\t30e30\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n16\t-30e-30\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" + +# Skip NULL +echo -e "'x'\t'trash'\t'y'\n17\tNULL\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" + +# Skip an array +echo -e "'x'\t'trash'\t'y'\n18\t[1,2,3,4]\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n19\t['some string ]][[][][]', 'one more string (){}][[{[[[[[[']\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e 
"'x'\t'trash'\t'y'\n20\t[[(1,2), (3,4)], [(5,6), (7,8)]]\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" + +# Skip a tuple +echo -e "'x'\t'trash'\t'y'\n21\t(1,2,3,4)\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n22\t('some string ()))))(()(())', 'one more string (){}][[{[)))))')\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n23\t(([1,2], (3,4)), ([5,6], (7,8)))\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" + +# Skip a map +echo -e "'x'\t'trash'\t'y'\n24\t{1:2,2:3,3:4,4:5}\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n25\t{'some string }}}}}}{{{{':123, 'one more string (){}][[{[{{{{{':123}\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" +echo -e "'x'\t'trash'\t'y'\n26\t{'key':{1:(1,2), 2:(3,4)}, 'foo':{1:(5,6), 2:(7,8)}}\t42" | $CLICKHOUSE_CLIENT -q "$QUERY" + +$CLICKHOUSE_CLIENT -q "select * from test_02129 order by x" +$CLICKHOUSE_CLIENT -q "drop table test_02129" +