From a90d126b937d5504b244017fc674fd1500374baf Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 4 Oct 2019 22:44:48 +0300 Subject: [PATCH] support defaults_for_omitted_fields in TSV --- dbms/src/Core/Settings.h | 2 +- dbms/src/Formats/FormatFactory.cpp | 1 + dbms/src/Formats/FormatSettings.h | 7 +++++ .../Impl/TabSeparatedRowInputFormat.cpp | 31 ++++++++++++------- .../Formats/Impl/TabSeparatedRowInputFormat.h | 2 ++ docs/en/operations/settings/settings.md | 2 +- docs/ru/operations/settings/settings.md | 1 + 7 files changed, 33 insertions(+), 13 deletions(-) diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index 27133bdff98..f56a39510e0 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -176,7 +176,7 @@ struct Settings : public SettingsCollection M(SettingBool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).") \ M(SettingBool, input_format_with_names_use_header, false, "For TSVWithNames and CSVWithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.") \ M(SettingBool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).") \ - M(SettingBool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow format).") \ + M(SettingBool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, CSV and TSV formats).") \ M(SettingBool, input_format_null_as_default, false, "For text input format initialize null fields with default values if data type of this field is not nullable") \ \ M(SettingBool, input_format_values_interpret_expressions, true, "For Values format: if field could not be parsed by 
streaming parser, run SQL parser and try to interpret it as SQL expression.") \ diff --git a/dbms/src/Formats/FormatFactory.cpp b/dbms/src/Formats/FormatFactory.cpp index 39e37c0ff8c..f9c55a32de6 100644 --- a/dbms/src/Formats/FormatFactory.cpp +++ b/dbms/src/Formats/FormatFactory.cpp @@ -53,6 +53,7 @@ static FormatSettings getInputFormatSetting(const Settings & settings) format_settings.template_settings.resultset_format = settings.format_template_resultset; format_settings.template_settings.row_format = settings.format_template_row; format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; + format_settings.tsv.empty_as_default = settings.input_format_defaults_for_omitted_fields; return format_settings; } diff --git a/dbms/src/Formats/FormatSettings.h b/dbms/src/Formats/FormatSettings.h index 3face6f5cef..dfd5d5b86f6 100644 --- a/dbms/src/Formats/FormatSettings.h +++ b/dbms/src/Formats/FormatSettings.h @@ -60,6 +60,13 @@ struct FormatSettings Template template_settings; + struct TSV + { + bool empty_as_default = false; + }; + + TSV tsv; + bool skip_unknown_fields = false; bool with_names_use_header = false; bool write_statistics = true; diff --git a/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index ef95f9f2ae1..8aa24e35aec 100644 --- a/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -180,16 +180,11 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column) { const auto & column_index = column_indexes_for_input_fields[file_column]; + const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size(); if (column_index) { const auto & type = data_types[*column_index]; - if 
(format_settings.null_as_default && !type->isNullable())
-                ext.read_columns[*column_index] = DataTypeNullable::deserializeTextEscaped(*columns[*column_index], in, format_settings, type);
-            else
-            {
-                type->deserializeAsTextEscaped(*columns[*column_index], in, format_settings);
-                ext.read_columns[*column_index] = true;
-            }
+            ext.read_columns[*column_index] = readField(*columns[*column_index], type, is_last_file_column);
         }
         else
         {
@@ -216,6 +211,22 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens
     return true;
 }
 
+/// Reads one tab-separated field into `column`. With tsv.empty_as_default, an empty field (cursor already at '\t', or at '\n'/EOF for the last file column) gets insertDefault() and false is returned; otherwise the field is parsed with the Escaped (TSV) deserializers, exactly as the inlined code did before this helper existed.
+bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column)
+{
+    const bool at_delimiter = !in.eof() && *in.position() == '\t'; /// nothing precedes the next tab, so the field is empty
+    const bool at_last_column_line_end = is_last_file_column && (in.eof() || *in.position() == '\n'); /// empty trailing field
+    if (format_settings.tsv.empty_as_default && (at_delimiter || at_last_column_line_end))
+    {
+        column.insertDefault();
+        return false; /// readRow stores this result in ext.read_columns
+    }
+    else if (format_settings.null_as_default && !type->isNullable())
+        return DataTypeNullable::deserializeTextEscaped(column, in, format_settings, type); /// Escaped, not CSV: this is the TSV reader
+    type->deserializeAsTextEscaped(column, in, format_settings); /// same deserializer the removed inline code called
+    return true;
+}
+
 bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
 {
     for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
@@ -314,10 +325,8 @@ void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, I
     prev_pos = in.position();
     if (column_indexes_for_input_fields[file_column])
     {
-        if (format_settings.null_as_default && !type->isNullable())
-            DataTypeNullable::deserializeTextEscaped(column, in, format_settings, type);
-        else
-            type->deserializeAsTextEscaped(column, in, format_settings);
+        const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
+        readField(column, type, is_last_file_column);
     }
     else
     {
diff --git
a/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index a28ac62ed4f..9d3f0b52d11 100644 --- a/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -41,6 +41,8 @@ private: std::vector read_columns; std::vector columns_to_fill_with_default_values; + bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column); + void addInputColumn(const String & column_name); void setupAllColumnsByTableSchema(); void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext); diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 85e910e1210..ac21e65d4ec 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -227,7 +227,7 @@ Enabled by default. ## input_format_defaults_for_omitted_fields {#session_settings-input_format_defaults_for_omitted_fields} -When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md#jsoneachrow) and [CSV](../../interfaces/formats.md#csv) formats. +When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md#jsoneachrow), [CSV](../../interfaces/formats.md#csv) and [TabSeparated](../../interfaces/formats.md#tabseparated) formats. !!! note "Note" When this option is enabled, extended table metadata are sent from server to client. It consumes additional computing resources on the server and can reduce performance. 
diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index f12fa5eeea6..25c9bca5bcf 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -207,6 +207,7 @@ Ok. - [JSONEachRow](../../interfaces/formats.md#jsoneachrow) - [CSV](../../interfaces/formats.md#csv) +- [TabSeparated](../../interfaces/formats.md#tabseparated) !!! note "Примечание" Когда опция включена, сервер отправляет клиенту расширенные метаданные. Это требует дополнительных вычислительных ресурсов на сервере и может снизить производительность.