mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 09:02:00 +00:00
Merge pull request #50635 from Avogar/skip-trailing-empty-lines
Allow to skip trailing empty lines in CSV/TSV/CustomSeparated formats
This commit is contained in:
commit
cbed327077
@ -193,6 +193,7 @@ SELECT * FROM nestedt FORMAT TSV
|
||||
- [output_format_tsv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_tsv_crlf_end_of_line) - if it is set true, end of line in TSV output format will be `\r\n` instead of `\n`. Default value - `false`.
|
||||
- [input_format_tsv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_first_lines) - skip specified number of lines at the beginning of data. Default value - `0`.
|
||||
- [input_format_tsv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_detect_header) - automatically detect header with names and types in TSV format. Default value - `true`.
|
||||
- [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.
|
||||
|
||||
## TabSeparatedRaw {#tabseparatedraw}
|
||||
|
||||
@ -467,6 +468,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
|
||||
- [output_format_csv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_csv_crlf_end_of_line) - if it is set to true, end of line in CSV output format will be `\r\n` instead of `\n`. Default value - `false`.
|
||||
- [input_format_csv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_first_lines) - skip the specified number of lines at the beginning of data. Default value - `0`.
|
||||
- [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`.
|
||||
- [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.
|
||||
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
|
||||
|
||||
## CSVWithNames {#csvwithnames}
|
||||
@ -495,7 +497,9 @@ the types from input data will be compared with the types of the corresponding c
|
||||
|
||||
Similar to [Template](#format-template), but it prints or reads all names and types of columns and uses escaping rule from [format_custom_escaping_rule](/docs/en/operations/settings/settings-formats.md/#format_custom_escaping_rule) setting and delimiters from [format_custom_field_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_field_delimiter), [format_custom_row_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_before_delimiter), [format_custom_row_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_after_delimiter), [format_custom_row_between_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_between_delimiter), [format_custom_result_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_before_delimiter) and [format_custom_result_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_after_delimiter) settings, not from format strings.
|
||||
|
||||
If setting [input_format_custom_detect_header](/docs/en/operations/settings/settings.md/#input_format_custom_detect_header) is enabled, ClickHouse will automatically detect header with names and types if any.
|
||||
If setting [input_format_custom_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) is enabled, ClickHouse will automatically detect header with names and types if any.
|
||||
|
||||
If setting [input_format_custom_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_custom_skip_trailing_empty_lines) is enabled, trailing empty lines at the end of file will be skipped.
|
||||
|
||||
There is also `CustomSeparatedIgnoreSpaces` format, which is similar to [TemplateIgnoreSpaces](#templateignorespaces).
|
||||
|
||||
|
@ -728,6 +728,12 @@ My NULL
|
||||
My NULL
|
||||
```
|
||||
|
||||
### input_format_tsv_skip_trailing_empty_lines {#input_format_tsv_skip_trailing_empty_lines}
|
||||
|
||||
When enabled, trailing empty lines at the end of TSV file will be skipped.
|
||||
|
||||
Disabled by default.
|
||||
|
||||
## CSV format settings {#csv-format-settings}
|
||||
|
||||
### format_csv_delimiter {#format_csv_delimiter}
|
||||
@ -882,6 +888,12 @@ My NULL
|
||||
My NULL
|
||||
```
|
||||
|
||||
### input_format_csv_skip_trailing_empty_lines {#input_format_csv_skip_trailing_empty_lines}
|
||||
|
||||
When enabled, trailing empty lines at the end of CSV file will be skipped.
|
||||
|
||||
Disabled by default.
|
||||
|
||||
### input_format_csv_trim_whitespaces {#input_format_csv_trim_whitespaces}
|
||||
|
||||
Trims spaces and tabs in non-quoted CSV strings.
|
||||
@ -1475,6 +1487,12 @@ Sets the character that is interpreted as a suffix after the result set for [Cus
|
||||
|
||||
Default value: `''`.
|
||||
|
||||
### input_format_custom_skip_trailing_empty_lines {#input_format_custom_skip_trailing_empty_lines}
|
||||
|
||||
When enabled, trailing empty lines at the end of file in CustomSeparated format will be skipped.
|
||||
|
||||
Disabled by default.
|
||||
|
||||
## Regexp format settings {#regexp-format-settings}
|
||||
|
||||
### format_regexp_escaping_rule {#format_regexp_escaping_rule}
|
||||
|
@ -886,6 +886,9 @@ class IColumn;
|
||||
M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \
|
||||
M(UInt64, input_format_csv_skip_first_lines, 0, "Skip specified number of lines at the beginning of data in CSV format", 0) \
|
||||
M(UInt64, input_format_tsv_skip_first_lines, 0, "Skip specified number of lines at the beginning of data in TSV format", 0) \
|
||||
M(Bool, input_format_csv_skip_trailing_empty_lines, false, "Skip trailing empty lines in CSV format", 0) \
|
||||
M(Bool, input_format_tsv_skip_trailing_empty_lines, false, "Skip trailing empty lines in TSV format", 0) \
|
||||
M(Bool, input_format_custom_skip_trailing_empty_lines, false, "Skip trailing empty lines in CustomSeparated format", 0) \
|
||||
\
|
||||
M(Bool, input_format_native_allow_types_conversion, true, "Allow data types conversion in Native input format", 0) \
|
||||
\
|
||||
|
@ -69,6 +69,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.csv.use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference;
|
||||
format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines;
|
||||
format_settings.csv.try_detect_header = settings.input_format_csv_detect_header;
|
||||
format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines;
|
||||
format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces;
|
||||
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
|
||||
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
|
||||
@ -81,6 +82,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.custom.row_before_delimiter = settings.format_custom_row_before_delimiter;
|
||||
format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter;
|
||||
format_settings.custom.try_detect_header = settings.input_format_custom_detect_header;
|
||||
format_settings.custom.skip_trailing_empty_lines = settings.input_format_custom_skip_trailing_empty_lines;
|
||||
format_settings.date_time_input_format = settings.date_time_input_format;
|
||||
format_settings.date_time_output_format = settings.date_time_output_format;
|
||||
format_settings.input_format_ipv4_default_on_conversion_error = settings.input_format_ipv4_default_on_conversion_error;
|
||||
@ -150,6 +152,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.tsv.use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference;
|
||||
format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines;
|
||||
format_settings.tsv.try_detect_header = settings.input_format_tsv_detect_header;
|
||||
format_settings.tsv.skip_trailing_empty_lines = settings.input_format_tsv_skip_trailing_empty_lines;
|
||||
format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
|
||||
format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
|
||||
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
|
||||
|
@ -136,6 +136,7 @@ struct FormatSettings
|
||||
UInt64 skip_first_lines = 0;
|
||||
String custom_delimiter;
|
||||
bool try_detect_header = true;
|
||||
bool skip_trailing_empty_lines = false;
|
||||
bool trim_whitespaces = true;
|
||||
} csv;
|
||||
|
||||
@ -157,6 +158,7 @@ struct FormatSettings
|
||||
std::string field_delimiter;
|
||||
EscapingRule escaping_rule = EscapingRule::Escaped;
|
||||
bool try_detect_header = true;
|
||||
bool skip_trailing_empty_lines = false;
|
||||
} custom;
|
||||
|
||||
struct
|
||||
@ -292,6 +294,7 @@ struct FormatSettings
|
||||
bool use_best_effort_in_schema_inference = true;
|
||||
UInt64 skip_first_lines = 0;
|
||||
bool try_detect_header = true;
|
||||
bool skip_trailing_empty_lines = false;
|
||||
} tsv;
|
||||
|
||||
struct
|
||||
|
@ -325,6 +325,20 @@ void CSVFormatReader::setReadBuffer(ReadBuffer & in_)
|
||||
FormatWithNamesAndTypesReader::setReadBuffer(*buf);
|
||||
}
|
||||
|
||||
/// Reports whether the CSV input has reached its end.
/// With csv.skip_trailing_empty_lines disabled this is simply buf->eof().
/// With it enabled, a trailing run of '\n' / '\r' characters followed by EOF also counts as the end;
/// if anything else follows that run, the buffer is rolled back to the checkpoint and false is returned.
bool CSVFormatReader::checkForSuffix()
|
||||
{
|
||||
if (!format_settings.csv.skip_trailing_empty_lines)
|
||||
return buf->eof();
|
||||
|
||||
/// Checkpoint taken before the scan so it can be undone below.
PeekableReadBufferCheckpoint checkpoint(*buf);
|
||||
/// Empty-bodied loop: repeats while either checkChar call succeeds
/// (assumes checkChar advances past a matching character - confirm against its definition).
while (checkChar('\n', *buf) || checkChar('\r', *buf));
|
||||
if (buf->eof())
|
||||
return true;
|
||||
|
||||
/// Not at EOF after the line breaks: undo the scan before reporting "no suffix".
buf->rollbackToCheckpoint();
|
||||
return false;
|
||||
}
|
||||
|
||||
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
|
||||
: FormatWithNamesAndTypesSchemaReader(
|
||||
buf,
|
||||
|
@ -75,6 +75,7 @@ public:
|
||||
std::vector<String> readRow() { return readRowImpl<false>(); }
|
||||
std::vector<String> readRowForHeaderDetection() override { return readHeaderRow(); }
|
||||
|
||||
bool checkForSuffix() override;
|
||||
|
||||
template <bool is_header>
|
||||
std::vector<String> readRowImpl();
|
||||
|
@ -283,6 +283,8 @@ bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof)
|
||||
|
||||
/// Allow optional \n before eof.
|
||||
checkChar('\n', *buf);
|
||||
if (format_settings.custom.skip_trailing_empty_lines)
|
||||
while (checkChar('\n', *buf) || checkChar('\r', *buf));
|
||||
return buf->eof();
|
||||
}
|
||||
|
||||
@ -294,6 +296,8 @@ bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof)
|
||||
|
||||
/// Allow optional \n before eof.
|
||||
checkChar('\n', *buf);
|
||||
if (format_settings.custom.skip_trailing_empty_lines)
|
||||
while (checkChar('\n', *buf) || checkChar('\r', *buf));
|
||||
if (buf->eof())
|
||||
return true;
|
||||
}
|
||||
|
@ -286,6 +286,20 @@ void TabSeparatedFormatReader::setReadBuffer(ReadBuffer & in_)
|
||||
FormatWithNamesAndTypesReader::setReadBuffer(*buf);
|
||||
}
|
||||
|
||||
/// Reports whether the TSV input has reached its end.
/// With tsv.skip_trailing_empty_lines disabled this is simply buf->eof().
/// With it enabled, a trailing run of '\n' / '\r' characters followed by EOF also counts as the end;
/// if anything else follows that run, the buffer is rolled back to the checkpoint and false is returned.
bool TabSeparatedFormatReader::checkForSuffix()
|
||||
{
|
||||
if (!format_settings.tsv.skip_trailing_empty_lines)
|
||||
return buf->eof();
|
||||
|
||||
/// Checkpoint taken before the scan so it can be undone below.
PeekableReadBufferCheckpoint checkpoint(*buf);
|
||||
/// Empty-bodied loop: repeats while either checkChar call succeeds
/// (assumes checkChar advances past a matching character - confirm against its definition).
while (checkChar('\n', *buf) || checkChar('\r', *buf));
|
||||
if (buf->eof())
|
||||
return true;
|
||||
|
||||
/// Not at EOF after the line breaks: undo the scan before reporting "no suffix".
buf->rollbackToCheckpoint();
|
||||
return false;
|
||||
}
|
||||
|
||||
TabSeparatedSchemaReader::TabSeparatedSchemaReader(
|
||||
ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_)
|
||||
: FormatWithNamesAndTypesSchemaReader(
|
||||
|
@ -75,6 +75,8 @@ public:
|
||||
|
||||
void setReadBuffer(ReadBuffer & in_) override;
|
||||
|
||||
bool checkForSuffix() override;
|
||||
|
||||
private:
|
||||
template <bool is_header>
|
||||
std::vector<String> readRowImpl();
|
||||
|
@ -0,0 +1,3 @@
|
||||
1 2
|
||||
1 2
|
||||
1 2
|
@ -0,0 +1,12 @@
|
||||
select * from format(TSV, 'x UInt32, y UInt32', '1\t2\n\n') settings input_format_tsv_skip_trailing_empty_lines=0; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
|
||||
select * from format(TSV, 'x UInt32, y UInt32', '1\t2\n\n') settings input_format_tsv_skip_trailing_empty_lines=1;
|
||||
select * from format(TSV, 'x UInt32, y UInt32', '1\t2\n\n1\t2\n') settings input_format_tsv_skip_trailing_empty_lines=1; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
|
||||
|
||||
select * from format(CSV, 'x UInt32, y UInt32', '1,2\n\n') settings input_format_csv_skip_trailing_empty_lines=0; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
|
||||
select * from format(CSV, 'x UInt32, y UInt32', '1,2\n\n') settings input_format_csv_skip_trailing_empty_lines=1;
|
||||
select * from format(CSV, 'x UInt32, y UInt32', '1,2\n\n1,2\n') settings input_format_csv_skip_trailing_empty_lines=1; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
|
||||
|
||||
select * from format(CustomSeparated, 'x UInt32, y UInt32', '1\t2\n\n\n') settings input_format_custom_skip_trailing_empty_lines=0; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
|
||||
select * from format(CustomSeparated, 'x UInt32, y UInt32', '1\t2\n\n\n') settings input_format_custom_skip_trailing_empty_lines=1;
|
||||
select * from format(CustomSeparated, 'x UInt32, y UInt32', '1\t2\n\n\n1\t2\n\n\n') settings input_format_custom_skip_trailing_empty_lines=1; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
|
||||
|
Loading…
Reference in New Issue
Block a user