Fix CSV reading when the end of line is not CRLF

This commit is contained in:
kevinyhzou 2023-09-06 11:28:53 +08:00
parent 0f76ba83e0
commit ef30e6723d
6 changed files with 16 additions and 7 deletions

View File

@ -897,6 +897,12 @@ Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF).
Disabled by default.
### input_format_csv_crlf_end_of_line {#input_format_csv_crlf_end_of_line}
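When enabled, a carriage return (CR) found at the end of a CSV row must be followed by a line feed (LF), i.e. the DOS/Windows-style line separator (CRLF) is enforced and an exception is thrown otherwise. When disabled, a CR that is not followed by LF is accepted without an error.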
Disabled by default.
### input_format_csv_enum_as_number {#input_format_csv_enum_as_number}
When enabled, always treat enum values as enum ids for CSV input format. It's recommended to enable this setting if data contains only enum ids to optimize enum parsing.

View File

@ -884,6 +884,7 @@ class IColumn;
M(Bool, format_csv_allow_single_quotes, false, "If it is set to true, allow strings in single quotes.", 0) \
M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \
M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \
M(Bool, input_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n", 0) \
M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices", 0) \
M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". Braces around array can be omitted.)", 0) \
M(Bool, input_format_skip_unknown_fields, true, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \

View File

@ -58,7 +58,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.avro.output_rows_in_file = settings.output_format_avro_rows_in_file;
format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes;
format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes;
format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line;
format_settings.csv.crlf_end_of_line_for_output = settings.output_format_csv_crlf_end_of_line;
format_settings.csv.crlf_end_of_line_for_input = settings.input_format_csv_crlf_end_of_line;
format_settings.csv.delimiter = settings.format_csv_delimiter;
format_settings.csv.tuple_delimiter = settings.format_csv_delimiter;
format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default;

View File

@ -149,7 +149,8 @@ struct FormatSettings
bool allow_single_quotes = true;
bool allow_double_quotes = true;
bool empty_as_default = false;
bool crlf_end_of_line = false;
bool crlf_end_of_line_for_output = false;
bool crlf_end_of_line_for_input = false;
bool enum_as_number = false;
bool arrays_as_nested_csv = false;
String null_representation = "\\N";

View File

@ -177,7 +177,7 @@ void CSVFormatReader::skipRow()
}
}
static void skipEndOfLine(ReadBuffer & in)
static void skipEndOfLine(ReadBuffer & in, bool crlf_end_of_line)
{
/// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic)
@ -192,7 +192,7 @@ static void skipEndOfLine(ReadBuffer & in)
++in.position();
if (!in.eof() && *in.position() == '\n')
++in.position();
else
else if (crlf_end_of_line)
throw Exception(ErrorCodes::INCORRECT_DATA,
"Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)."
" Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.");
@ -258,7 +258,7 @@ void CSVFormatReader::skipRowEndDelimiter()
if (buf->eof())
return;
skipEndOfLine(*buf);
skipEndOfLine(*buf, format_settings.csv.crlf_end_of_line_for_input);
}
void CSVFormatReader::skipHeaderRow()
@ -343,7 +343,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
return false;
}
skipEndOfLine(*buf);
skipEndOfLine(*buf, format_settings.csv.crlf_end_of_line_for_input);
return true;
}

View File

@ -56,7 +56,7 @@ void CSVRowOutputFormat::writeFieldDelimiter()
void CSVRowOutputFormat::writeRowEndDelimiter()
{
if (format_settings.csv.crlf_end_of_line)
if (format_settings.csv.crlf_end_of_line_for_output)
writeChar('\r', out);
writeChar('\n', out);
}