mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 07:31:57 +00:00
Merge pull request #54340 from KevinyhZou/bug_fix_csv_parse_crlf
Fix CSV read while the end of line is CR(\r)
This commit is contained in:
commit
02df5ac59c
@ -897,6 +897,12 @@ Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF).
|
||||
|
||||
Disabled by default.
|
||||
|
||||
### input_format_csv_allow_cr_end_of_line {#input_format_csv_allow_cr_end_of_line}
|
||||
|
||||
If set to true, a CR (\\r) that is not followed by an LF (\\n) is allowed at the end of a line.
|
||||
|
||||
Disabled by default.
|
||||
|
||||
### input_format_csv_enum_as_number {#input_format_csv_enum_as_number}
|
||||
|
||||
When enabled, always treat enum values as enum ids for CSV input format. It's recommended to enable this setting if data contains only enum ids to optimize enum parsing.
|
||||
|
@ -886,6 +886,7 @@ class IColumn;
|
||||
M(Bool, format_csv_allow_single_quotes, false, "If it is set to true, allow strings in single quotes.", 0) \
|
||||
M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \
|
||||
M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \
|
||||
M(Bool, input_format_csv_allow_cr_end_of_line, false, "If it is set true, \\r will be allowed at end of line not followed by \\n", 0) \
|
||||
M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices", 0) \
|
||||
M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". Braces around array can be omitted.)", 0) \
|
||||
M(Bool, input_format_skip_unknown_fields, true, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \
|
||||
|
@ -59,6 +59,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes;
|
||||
format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes;
|
||||
format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line;
|
||||
format_settings.csv.allow_cr_end_of_line = settings.input_format_csv_allow_cr_end_of_line;
|
||||
format_settings.csv.delimiter = settings.format_csv_delimiter;
|
||||
format_settings.csv.tuple_delimiter = settings.format_csv_delimiter;
|
||||
format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default;
|
||||
|
@ -150,6 +150,7 @@ struct FormatSettings
|
||||
bool allow_double_quotes = true;
|
||||
bool empty_as_default = false;
|
||||
bool crlf_end_of_line = false;
|
||||
bool allow_cr_end_of_line = false;
|
||||
bool enum_as_number = false;
|
||||
bool arrays_as_nested_csv = false;
|
||||
String null_representation = "\\N";
|
||||
|
@ -835,7 +835,7 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV &
|
||||
|
||||
/// Check for single '\r' not followed by '\n'
|
||||
/// We should not stop in this case.
|
||||
if (*buf.position() == '\r')
|
||||
if (*buf.position() == '\r' && !settings.allow_cr_end_of_line)
|
||||
{
|
||||
++buf.position();
|
||||
if (!buf.eof() && *buf.position() != '\n')
|
||||
|
@ -177,7 +177,7 @@ void CSVFormatReader::skipRow()
|
||||
}
|
||||
}
|
||||
|
||||
static void skipEndOfLine(ReadBuffer & in)
|
||||
static void skipEndOfLine(ReadBuffer & in, bool allow_cr_end_of_line)
|
||||
{
|
||||
/// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic)
|
||||
|
||||
@ -192,7 +192,7 @@ static void skipEndOfLine(ReadBuffer & in)
|
||||
++in.position();
|
||||
if (!in.eof() && *in.position() == '\n')
|
||||
++in.position();
|
||||
else
|
||||
else if (!allow_cr_end_of_line)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA,
|
||||
"Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)."
|
||||
" Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.");
|
||||
@ -258,7 +258,7 @@ void CSVFormatReader::skipRowEndDelimiter()
|
||||
if (buf->eof())
|
||||
return;
|
||||
|
||||
skipEndOfLine(*buf);
|
||||
skipEndOfLine(*buf, format_settings.csv.allow_cr_end_of_line);
|
||||
}
|
||||
|
||||
void CSVFormatReader::skipHeaderRow()
|
||||
@ -343,7 +343,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
|
||||
return false;
|
||||
}
|
||||
|
||||
skipEndOfLine(*buf);
|
||||
skipEndOfLine(*buf, format_settings.csv.allow_cr_end_of_line);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,3 @@
|
||||
A 110 208819249
|
||||
B 112 208819248
|
||||
C 123 783434434
|
13
tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh
Executable file
13
tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh
Executable file
@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# NOTE: this sh wrapper is required because of shell_config
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl"
|
||||
$CLICKHOUSE_CLIENT -q "create table test_tbl (a String, b String, c String) engine=MergeTree order by a"
|
||||
# Load the csv_with_cr_end_of_line.csv fixture through stdin with the CR end-of-line setting enabled.
# "$CURDIR" is quoted (as in the shell_config.sh source line) so a checkout path containing
# spaces or glob characters is not word-split; a redirect replaces the extra `cat` process.
${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tbl SETTINGS input_format_csv_allow_cr_end_of_line=true FORMAT CSV" < "$CURDIR"/data_csv/csv_with_cr_end_of_line.csv
|
||||
$CLICKHOUSE_CLIENT -q "select * from test_tbl"
|
||||
$CLICKHOUSE_CLIENT -q "drop table test_tbl"
|
@ -0,0 +1,2 @@
|
||||
A,110,208819249
|
||||
B,112,208819248
C,123,783434434
|
|
Loading…
Reference in New Issue
Block a user