Merge pull request #54340 from KevinyhZou/bug_fix_csv_parse_crlf

Fix CSV read while the end of line is CR(\r)
This commit is contained in:
Sema Checherinda 2023-11-07 12:51:05 +01:00 committed by GitHub
commit 02df5ac59c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 32 additions and 5 deletions

View File

@ -897,6 +897,12 @@ Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF).
Disabled by default.
### input_format_csv_allow_cr_end_of_line {#input_format_csv_allow_cr_end_of_line}
If set to true, a CR (\\r) that is not followed by an LF (\\n) is accepted as the end of a line.
Disabled by default.
### input_format_csv_enum_as_number {#input_format_csv_enum_as_number}
When enabled, always treat enum values as enum ids for CSV input format. It's recommended to enable this setting if data contains only enum ids to optimize enum parsing.

View File

@ -886,6 +886,7 @@ class IColumn;
M(Bool, format_csv_allow_single_quotes, false, "If it is set to true, allow strings in single quotes.", 0) \
M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \
M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \
M(Bool, input_format_csv_allow_cr_end_of_line, false, "If it is set true, \\r will be allowed at end of line not followed by \\n", 0) \
M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices", 0) \
M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". Braces around array can be omitted.)", 0) \
M(Bool, input_format_skip_unknown_fields, true, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \

View File

@ -59,6 +59,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes;
format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes;
format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line;
format_settings.csv.allow_cr_end_of_line = settings.input_format_csv_allow_cr_end_of_line;
format_settings.csv.delimiter = settings.format_csv_delimiter;
format_settings.csv.tuple_delimiter = settings.format_csv_delimiter;
format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default;

View File

@ -150,6 +150,7 @@ struct FormatSettings
bool allow_double_quotes = true;
bool empty_as_default = false;
bool crlf_end_of_line = false;
bool allow_cr_end_of_line = false;
bool enum_as_number = false;
bool arrays_as_nested_csv = false;
String null_representation = "\\N";

View File

@ -835,7 +835,7 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV &
/// Check for single '\r' not followed by '\n'
/// We should not stop in this case.
if (*buf.position() == '\r')
if (*buf.position() == '\r' && !settings.allow_cr_end_of_line)
{
++buf.position();
if (!buf.eof() && *buf.position() != '\n')

View File

@ -177,7 +177,7 @@ void CSVFormatReader::skipRow()
}
}
static void skipEndOfLine(ReadBuffer & in)
static void skipEndOfLine(ReadBuffer & in, bool allow_cr_end_of_line)
{
/// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic)
@ -192,7 +192,7 @@ static void skipEndOfLine(ReadBuffer & in)
++in.position();
if (!in.eof() && *in.position() == '\n')
++in.position();
else
else if (!allow_cr_end_of_line)
throw Exception(ErrorCodes::INCORRECT_DATA,
"Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)."
" Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.");
@ -258,7 +258,7 @@ void CSVFormatReader::skipRowEndDelimiter()
if (buf->eof())
return;
skipEndOfLine(*buf);
skipEndOfLine(*buf, format_settings.csv.allow_cr_end_of_line);
}
void CSVFormatReader::skipHeaderRow()
@ -343,7 +343,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
return false;
}
skipEndOfLine(*buf);
skipEndOfLine(*buf, format_settings.csv.allow_cr_end_of_line);
return true;
}

View File

@ -0,0 +1,3 @@
A 110 208819249
B 112 208819248
C 123 783434434

View File

@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Test: insert the csv_with_cr_end_of_line.csv fixture with
# input_format_csv_allow_cr_end_of_line enabled and print the stored rows.
# NOTE: this sh wrapper is required because of shell_config
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
# CLICKHOUSE_CLIENT is left unquoted exactly as before (presumably it may carry
# extra arguments set up by shell_config.sh -- verify before quoting it).
# Start from a clean table so a previous run cannot affect the output.
$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl"
$CLICKHOUSE_CLIENT -q "create table test_tbl (a String, b String, c String) engine=MergeTree order by a"
# Quote $CURDIR so a checkout path containing spaces or glob characters is
# passed to cat as a single argument.
cat "$CURDIR"/data_csv/csv_with_cr_end_of_line.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tbl SETTINGS input_format_csv_allow_cr_end_of_line=true FORMAT CSV"
# The selected rows are the test's observable output.
$CLICKHOUSE_CLIENT -q "select * from test_tbl"
$CLICKHOUSE_CLIENT -q "drop table test_tbl"

View File

@ -0,0 +1,2 @@
A,110,208819249
B,112,208819248 C,123,783434434
1 A,110,208819249
2 B,112,208819248 C,123,783434434