From ef30e6723d0cb9f9d0f4f4b2fb107df74d6d6866 Mon Sep 17 00:00:00 2001 From: kevinyhzou Date: Wed, 6 Sep 2023 11:28:53 +0800 Subject: [PATCH 1/4] bug fix csv read while end of line is not crlf --- docs/en/operations/settings/settings-formats.md | 6 ++++++ src/Core/Settings.h | 1 + src/Formats/FormatFactory.cpp | 3 ++- src/Formats/FormatSettings.h | 3 ++- src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 8 ++++---- src/Processors/Formats/Impl/CSVRowOutputFormat.cpp | 2 +- 6 files changed, 16 insertions(+), 7 deletions(-) diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index bb59402079e..266f8f8bd66 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -897,6 +897,12 @@ Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF). Disabled by default. +### input_format_csv_crlf_end_of_line {#input_format_csv_crlf_end_of_line} + +Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF). + +Disabled by default. + ### input_format_csv_enum_as_number {#input_format_csv_enum_as_number} When enabled, always treat enum values as enum ids for CSV input format. It's recommended to enable this setting if data contains only enum ids to optimize enum parsing. 
diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 609ade4cdc0..c35393ba353 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -884,6 +884,7 @@ class IColumn; M(Bool, format_csv_allow_single_quotes, false, "If it is set to true, allow strings in single quotes.", 0) \ M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \ M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \ + M(Bool, input_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n", 0) \ M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices", 0) \ M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". Braces around array can be omitted.)", 0) \ M(Bool, input_format_skip_unknown_fields, true, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index d51ea9ad2d0..168a4dc48bb 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -58,7 +58,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.avro.output_rows_in_file = settings.output_format_avro_rows_in_file; format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes; format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes; - format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line; + format_settings.csv.crlf_end_of_line_for_output = settings.output_format_csv_crlf_end_of_line; + format_settings.csv.crlf_end_of_line_for_input = 
settings.input_format_csv_crlf_end_of_line; format_settings.csv.delimiter = settings.format_csv_delimiter; format_settings.csv.tuple_delimiter = settings.format_csv_delimiter; format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 382f8b7173a..fe4c9694ec9 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -149,7 +149,8 @@ struct FormatSettings bool allow_single_quotes = true; bool allow_double_quotes = true; bool empty_as_default = false; - bool crlf_end_of_line = false; + bool crlf_end_of_line_for_output = false; + bool crlf_end_of_line_for_input = false; bool enum_as_number = false; bool arrays_as_nested_csv = false; String null_representation = "\\N"; diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index baaff8b497b..2e6f6e5e0e2 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -177,7 +177,7 @@ void CSVFormatReader::skipRow() } } -static void skipEndOfLine(ReadBuffer & in) +static void skipEndOfLine(ReadBuffer & in, bool crlf_end_of_line) { /// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic) @@ -192,7 +192,7 @@ static void skipEndOfLine(ReadBuffer & in) ++in.position(); if (!in.eof() && *in.position() == '\n') ++in.position(); - else + else if (crlf_end_of_line) throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)." 
" Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r."); @@ -258,7 +258,7 @@ void CSVFormatReader::skipRowEndDelimiter() if (buf->eof()) return; - skipEndOfLine(*buf); + skipEndOfLine(*buf, format_settings.csv.crlf_end_of_line_for_input); } void CSVFormatReader::skipHeaderRow() @@ -343,7 +343,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) return false; } - skipEndOfLine(*buf); + skipEndOfLine(*buf, format_settings.csv.crlf_end_of_line_for_input); return true; } diff --git a/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp b/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp index 304e877aae9..965a339887d 100644 --- a/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp @@ -56,7 +56,7 @@ void CSVRowOutputFormat::writeFieldDelimiter() void CSVRowOutputFormat::writeRowEndDelimiter() { - if (format_settings.csv.crlf_end_of_line) + if (format_settings.csv.crlf_end_of_line_for_output) writeChar('\r', out); writeChar('\n', out); } From 2a50daf5dde6c312cbc2df0b69c363807e9dd48f Mon Sep 17 00:00:00 2001 From: kevinyhzou Date: Fri, 27 Oct 2023 15:43:44 +0800 Subject: [PATCH 2/4] Allow cr at end of csv line --- docs/en/operations/settings/settings-formats.md | 4 ++-- src/Core/Settings.h | 2 +- src/Formats/FormatFactory.cpp | 4 ++-- src/Formats/FormatSettings.h | 4 ++-- src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 8 ++++---- src/Processors/Formats/Impl/CSVRowOutputFormat.cpp | 2 +- .../02891_input_csv_cr_end_of_line.reference | 3 +++ .../0_stateless/02891_input_csv_cr_end_of_line.sh | 13 +++++++++++++ .../data_csv/csv_with_cr_end_of_line.csv | 2 ++ 9 files changed, 30 insertions(+), 12 deletions(-) create mode 100644 tests/queries/0_stateless/02891_input_csv_cr_end_of_line.reference create mode 100644 tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh create mode 100644 tests/queries/0_stateless/data_csv/csv_with_cr_end_of_line.csv diff --git 
a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 266f8f8bd66..344e6dda680 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -897,9 +897,9 @@ Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF). Disabled by default. -### input_format_csv_crlf_end_of_line {#input_format_csv_crlf_end_of_line} +### input_format_csv_allow_cr_end_of_line {#input_format_csv_allow_cr_end_of_line} -Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF). +When enabled, a lone CR (\\r) that is not followed by LF (\\n) is accepted as an end of line in CSV input. Disabled by default. diff --git a/src/Core/Settings.h b/src/Core/Settings.h index c35393ba353..06438fa5389 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -884,7 +884,7 @@ class IColumn; M(Bool, format_csv_allow_single_quotes, false, "If it is set to true, allow strings in single quotes.", 0) \ M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \ M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \ - M(Bool, input_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n", 0) \ + M(Bool, input_format_csv_allow_cr_end_of_line, false, "If it is set true, \\r will be allowed at end of line not followed by \\n", 0) \ M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices", 0) \ M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". 
Braces around array can be omitted.)", 0) \ M(Bool, input_format_skip_unknown_fields, true, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 168a4dc48bb..7fb355b6c43 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -58,8 +58,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.avro.output_rows_in_file = settings.output_format_avro_rows_in_file; format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes; format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes; - format_settings.csv.crlf_end_of_line_for_output = settings.output_format_csv_crlf_end_of_line; - format_settings.csv.crlf_end_of_line_for_input = settings.input_format_csv_crlf_end_of_line; + format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line; + format_settings.csv.allow_cr_end_of_line = settings.input_format_csv_allow_cr_end_of_line; format_settings.csv.delimiter = settings.format_csv_delimiter; format_settings.csv.tuple_delimiter = settings.format_csv_delimiter; format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index fe4c9694ec9..9f99a47d4d5 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -149,8 +149,8 @@ struct FormatSettings bool allow_single_quotes = true; bool allow_double_quotes = true; bool empty_as_default = false; - bool crlf_end_of_line_for_output = false; - bool crlf_end_of_line_for_input = false; + bool crlf_end_of_line = false; + bool allow_cr_end_of_line = false; bool enum_as_number = false; bool arrays_as_nested_csv = false; String null_representation = "\\N"; diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp 
b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 2e6f6e5e0e2..9ea42de3d32 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -177,7 +177,7 @@ void CSVFormatReader::skipRow() } } -static void skipEndOfLine(ReadBuffer & in, bool crlf_end_of_line) +static void skipEndOfLine(ReadBuffer & in, bool allow_cr_end_of_line) { /// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic) @@ -192,7 +192,7 @@ static void skipEndOfLine(ReadBuffer & in, bool crlf_end_of_line) ++in.position(); if (!in.eof() && *in.position() == '\n') ++in.position(); - else if (crlf_end_of_line) + else if (!allow_cr_end_of_line) throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)." " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r."); @@ -258,7 +258,7 @@ void CSVFormatReader::skipRowEndDelimiter() if (buf->eof()) return; - skipEndOfLine(*buf, format_settings.csv.crlf_end_of_line_for_input); + skipEndOfLine(*buf, format_settings.csv.allow_cr_end_of_line); } void CSVFormatReader::skipHeaderRow() @@ -343,7 +343,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) return false; } - skipEndOfLine(*buf, format_settings.csv.crlf_end_of_line_for_input); + skipEndOfLine(*buf, format_settings.csv.allow_cr_end_of_line); return true; } diff --git a/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp b/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp index 965a339887d..304e877aae9 100644 --- a/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp @@ -56,7 +56,7 @@ void CSVRowOutputFormat::writeFieldDelimiter() void CSVRowOutputFormat::writeRowEndDelimiter() { - if (format_settings.csv.crlf_end_of_line_for_output) + if (format_settings.csv.crlf_end_of_line) writeChar('\r', out); writeChar('\n', out); } diff --git a/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.reference 
b/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.reference new file mode 100644 index 00000000000..9a9f63dc0a5 --- /dev/null +++ b/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.reference @@ -0,0 +1,3 @@ +A 110 208819249 +B 112 208819248 +C 123 783434434 diff --git a/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh b/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh new file mode 100644 index 00000000000..1e2f647fae3 --- /dev/null +++ b/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# NOTE: this sh wrapper is required because of shell_config + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl" +$CLICKHOUSE_CLIENT -q "create table test_tbl (a String, b String, c String) engine=MergeTree order by a" +cat $CURDIR/data_csv/csv_with_cr_end_of_line.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tbl SETTINGS input_format_csv_allow_cr_end_of_line=true FORMAT CSV" +$CLICKHOUSE_CLIENT -q "select * from test_tbl" +$CLICKHOUSE_CLIENT -q "drop table test_tbl" \ No newline at end of file diff --git a/tests/queries/0_stateless/data_csv/csv_with_cr_end_of_line.csv b/tests/queries/0_stateless/data_csv/csv_with_cr_end_of_line.csv new file mode 100644 index 00000000000..077ca2c84c5 --- /dev/null +++ b/tests/queries/0_stateless/data_csv/csv_with_cr_end_of_line.csv @@ -0,0 +1,2 @@ +A,110,208819249 +B,112,208819248 C,123,783434434 From 91f17cc2683ee77770c27c9bb46533ba84abee48 Mon Sep 17 00:00:00 2001 From: kevinyhzou Date: Mon, 6 Nov 2023 14:09:12 +0800 Subject: [PATCH 3/4] ci fix --- tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh diff --git a/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh 
b/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh old mode 100644 new mode 100755 From d1743e08f7c1ec93a22d66e7ba08814de0a4c63e Mon Sep 17 00:00:00 2001 From: kevinyhzou Date: Tue, 7 Nov 2023 09:45:46 +0800 Subject: [PATCH 4/4] ci fix --- src/IO/ReadHelpers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 9b9374ff05a..19750906fdb 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -835,7 +835,7 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & /// Check for single '\r' not followed by '\n' /// We should not stop in this case. - if (*buf.position() == '\r') + if (*buf.position() == '\r' && !settings.allow_cr_end_of_line) { ++buf.position(); if (!buf.eof() && *buf.position() != '\n')