Merge pull request #50635 from Avogar/skip-trailing-empty-lines

Allow to skip trailing empty lines in CSV/TSV/CustomSeparated formats
This commit is contained in:
Kruglov Pavel 2023-06-13 12:43:43 +02:00 committed by GitHub
commit cbed327077
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 82 additions and 1 deletions

View File

@ -193,6 +193,7 @@ SELECT * FROM nestedt FORMAT TSV
- [output_format_tsv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_tsv_crlf_end_of_line) - if it is set true, end of line in TSV output format will be `\r\n` instead of `\n`. Default value - `false`.
- [input_format_tsv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_first_lines) - skip specified number of lines at the beginning of data. Default value - `0`.
- [input_format_tsv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_detect_header) - automatically detect header with names and types in TSV format. Default value - `true`.
- [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.
## TabSeparatedRaw {#tabseparatedraw}
@ -467,6 +468,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
- [output_format_csv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_csv_crlf_end_of_line) - if it is set to true, end of line in CSV output format will be `\r\n` instead of `\n`. Default value - `false`.
- [input_format_csv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_first_lines) - skip the specified number of lines at the beginning of data. Default value - `0`.
- [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`.
- [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
## CSVWithNames {#csvwithnames}
@ -495,7 +497,9 @@ the types from input data will be compared with the types of the corresponding c
Similar to [Template](#format-template), but it prints or reads all names and types of columns and uses escaping rule from [format_custom_escaping_rule](/docs/en/operations/settings/settings-formats.md/#format_custom_escaping_rule) setting and delimiters from [format_custom_field_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_field_delimiter), [format_custom_row_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_before_delimiter), [format_custom_row_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_after_delimiter), [format_custom_row_between_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_between_delimiter), [format_custom_result_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_before_delimiter) and [format_custom_result_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_after_delimiter) settings, not from format strings.
If setting [input_format_custom_detect_header](/docs/en/operations/settings/settings.md/#input_format_custom_detect_header) is enabled, ClickHouse will automatically detect header with names and types if any.
If setting [input_format_custom_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) is enabled, ClickHouse will automatically detect header with names and types if any.
If setting [input_format_custom_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_custom_skip_trailing_empty_lines) is enabled, trailing empty lines at the end of file will be skipped.
There is also `CustomSeparatedIgnoreSpaces` format, which is similar to [TemplateIgnoreSpaces](#templateignorespaces).

View File

@ -728,6 +728,12 @@ My NULL
My NULL
```
### input_format_tsv_skip_trailing_empty_lines {#input_format_tsv_skip_trailing_empty_lines}
When enabled, trailing empty lines at the end of TSV file will be skipped.
Disabled by default.
## CSV format settings {#csv-format-settings}
### format_csv_delimiter {#format_csv_delimiter}
@ -882,6 +888,12 @@ My NULL
My NULL
```
### input_format_csv_skip_trailing_empty_lines {#input_format_csv_skip_trailing_empty_lines}
When enabled, trailing empty lines at the end of CSV file will be skipped.
Disabled by default.
### input_format_csv_trim_whitespaces {#input_format_csv_trim_whitespaces}
Trims spaces and tabs in non-quoted CSV strings.
@ -1475,6 +1487,12 @@ Sets the character that is interpreted as a suffix after the result set for [Cus
Default value: `''`.
### input_format_custom_skip_trailing_empty_lines {#input_format_custom_skip_trailing_empty_lines}
When enabled, trailing empty lines at the end of file in CustomSeparated format will be skipped.
Disabled by default.
## Regexp format settings {#regexp-format-settings}
### format_regexp_escaping_rule {#format_regexp_escaping_rule}

View File

@ -886,6 +886,9 @@ class IColumn;
M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \
M(UInt64, input_format_csv_skip_first_lines, 0, "Skip specified number of lines at the beginning of data in CSV format", 0) \
M(UInt64, input_format_tsv_skip_first_lines, 0, "Skip specified number of lines at the beginning of data in TSV format", 0) \
M(Bool, input_format_csv_skip_trailing_empty_lines, false, "Skip trailing empty lines in CSV format", 0) \
M(Bool, input_format_tsv_skip_trailing_empty_lines, false, "Skip trailing empty lines in TSV format", 0) \
M(Bool, input_format_custom_skip_trailing_empty_lines, false, "Skip trailing empty lines in CustomSeparated format", 0) \
\
M(Bool, input_format_native_allow_types_conversion, true, "Allow data types conversion in Native input format", 0) \
\

View File

@ -69,6 +69,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference;
format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines;
format_settings.csv.try_detect_header = settings.input_format_csv_detect_header;
format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines;
format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces;
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
@ -81,6 +82,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.custom.row_before_delimiter = settings.format_custom_row_before_delimiter;
format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter;
format_settings.custom.try_detect_header = settings.input_format_custom_detect_header;
format_settings.custom.skip_trailing_empty_lines = settings.input_format_custom_skip_trailing_empty_lines;
format_settings.date_time_input_format = settings.date_time_input_format;
format_settings.date_time_output_format = settings.date_time_output_format;
format_settings.input_format_ipv4_default_on_conversion_error = settings.input_format_ipv4_default_on_conversion_error;
@ -150,6 +152,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.tsv.use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference;
format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines;
format_settings.tsv.try_detect_header = settings.input_format_tsv_detect_header;
format_settings.tsv.skip_trailing_empty_lines = settings.input_format_tsv_skip_trailing_empty_lines;
format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;

View File

@ -136,6 +136,7 @@ struct FormatSettings
UInt64 skip_first_lines = 0;
String custom_delimiter;
bool try_detect_header = true;
bool skip_trailing_empty_lines = false;
bool trim_whitespaces = true;
} csv;
@ -157,6 +158,7 @@ struct FormatSettings
std::string field_delimiter;
EscapingRule escaping_rule = EscapingRule::Escaped;
bool try_detect_header = true;
bool skip_trailing_empty_lines = false;
} custom;
struct
@ -292,6 +294,7 @@ struct FormatSettings
bool use_best_effort_in_schema_inference = true;
UInt64 skip_first_lines = 0;
bool try_detect_header = true;
bool skip_trailing_empty_lines = false;
} tsv;
struct

View File

@ -325,6 +325,20 @@ void CSVFormatReader::setReadBuffer(ReadBuffer & in_)
FormatWithNamesAndTypesReader::setReadBuffer(*buf);
}
/// Reports whether only the (optional) suffix remains in the input.
/// With `csv.skip_trailing_empty_lines` disabled this is a plain end-of-buffer check.
/// With it enabled, any run of '\n' / '\r' characters is consumed first; if that does
/// not bring us to the end of the buffer, the position is restored to where it was on
/// entry, so the caller can continue parsing from the same place.
bool CSVFormatReader::checkForSuffix()
{
    if (format_settings.csv.skip_trailing_empty_lines)
    {
        /// Remember the current position so the skipped line breaks can be un-read.
        PeekableReadBufferCheckpoint checkpoint(*buf);

        while (true)
        {
            const bool consumed_line_break = checkChar('\n', *buf) || checkChar('\r', *buf);
            if (!consumed_line_break)
                break;
        }

        const bool reached_end = buf->eof();
        if (!reached_end)
            buf->rollbackToCheckpoint(); /// More data follows: the line breaks were not trailing.

        return reached_end;
    }

    return buf->eof();
}
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
: FormatWithNamesAndTypesSchemaReader(
buf,

View File

@ -75,6 +75,7 @@ public:
std::vector<String> readRow() { return readRowImpl<false>(); }
std::vector<String> readRowForHeaderDetection() override { return readHeaderRow(); }
bool checkForSuffix() override;
template <bool is_header>
std::vector<String> readRowImpl();

View File

@ -283,6 +283,8 @@ bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof)
/// Allow optional \n before eof.
checkChar('\n', *buf);
if (format_settings.custom.skip_trailing_empty_lines)
while (checkChar('\n', *buf) || checkChar('\r', *buf));
return buf->eof();
}
@ -294,6 +296,8 @@ bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof)
/// Allow optional \n before eof.
checkChar('\n', *buf);
if (format_settings.custom.skip_trailing_empty_lines)
while (checkChar('\n', *buf) || checkChar('\r', *buf));
if (buf->eof())
return true;
}

View File

@ -286,6 +286,20 @@ void TabSeparatedFormatReader::setReadBuffer(ReadBuffer & in_)
FormatWithNamesAndTypesReader::setReadBuffer(*buf);
}
/// Reports whether only the (optional) suffix remains in the input.
/// With `tsv.skip_trailing_empty_lines` disabled this is a plain end-of-buffer check.
/// With it enabled, any run of '\n' / '\r' characters is consumed first; if that does
/// not bring us to the end of the buffer, the position is restored to where it was on
/// entry, so the caller can continue parsing from the same place.
bool TabSeparatedFormatReader::checkForSuffix()
{
    if (format_settings.tsv.skip_trailing_empty_lines)
    {
        /// Remember the current position so the skipped line breaks can be un-read.
        PeekableReadBufferCheckpoint checkpoint(*buf);

        while (true)
        {
            const bool consumed_line_break = checkChar('\n', *buf) || checkChar('\r', *buf);
            if (!consumed_line_break)
                break;
        }

        const bool reached_end = buf->eof();
        if (!reached_end)
            buf->rollbackToCheckpoint(); /// More data follows: the line breaks were not trailing.

        return reached_end;
    }

    return buf->eof();
}
TabSeparatedSchemaReader::TabSeparatedSchemaReader(
ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_)
: FormatWithNamesAndTypesSchemaReader(

View File

@ -75,6 +75,8 @@ public:
void setReadBuffer(ReadBuffer & in_) override;
bool checkForSuffix() override;
private:
template <bool is_header>
std::vector<String> readRowImpl();

View File

@ -0,0 +1,12 @@
-- Checks the input_format_{tsv,csv,custom}_skip_trailing_empty_lines settings.
-- For each format there are three queries:
--   1. setting disabled, data ends with extra empty lines  -> expected to fail;
--   2. setting enabled, same data                          -> expected to succeed;
--   3. setting enabled, empty lines followed by more rows  -> expected to fail
--      (only empty lines at the very end of the data may be skipped).

-- TSV
select * from format(TSV, 'x UInt32, y UInt32', '1\t2\n\n') settings input_format_tsv_skip_trailing_empty_lines=0; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
select * from format(TSV, 'x UInt32, y UInt32', '1\t2\n\n') settings input_format_tsv_skip_trailing_empty_lines=1;
select * from format(TSV, 'x UInt32, y UInt32', '1\t2\n\n1\t2\n') settings input_format_tsv_skip_trailing_empty_lines=1; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
-- CSV
select * from format(CSV, 'x UInt32, y UInt32', '1,2\n\n') settings input_format_csv_skip_trailing_empty_lines=0; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
select * from format(CSV, 'x UInt32, y UInt32', '1,2\n\n') settings input_format_csv_skip_trailing_empty_lines=1;
select * from format(CSV, 'x UInt32, y UInt32', '1,2\n\n1,2\n') settings input_format_csv_skip_trailing_empty_lines=1; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
-- CustomSeparated
select * from format(CustomSeparated, 'x UInt32, y UInt32', '1\t2\n\n\n') settings input_format_custom_skip_trailing_empty_lines=0; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
select * from format(CustomSeparated, 'x UInt32, y UInt32', '1\t2\n\n\n') settings input_format_custom_skip_trailing_empty_lines=1;
select * from format(CustomSeparated, 'x UInt32, y UInt32', '1\t2\n\n\n1\t2\n\n\n') settings input_format_custom_skip_trailing_empty_lines=1; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}