support defaults_for_omitted_fields in TSV

This commit is contained in:
Alexander Tokmakov 2019-10-04 22:44:48 +03:00
parent d936cee836
commit a90d126b93
7 changed files with 33 additions and 13 deletions

View File

@ -176,7 +176,7 @@ struct Settings : public SettingsCollection<Settings>
M(SettingBool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).") \
M(SettingBool, input_format_with_names_use_header, false, "For TSVWithNames and CSVWithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.") \
M(SettingBool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).") \
M(SettingBool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow format).") \
M(SettingBool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, CSV and TSV formats).") \
M(SettingBool, input_format_null_as_default, false, "For text input format initialize null fields with default values if data type of this field is not nullable") \
\
M(SettingBool, input_format_values_interpret_expressions, true, "For Values format: if field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression.") \

View File

@ -53,6 +53,7 @@ static FormatSettings getInputFormatSetting(const Settings & settings)
format_settings.template_settings.resultset_format = settings.format_template_resultset;
format_settings.template_settings.row_format = settings.format_template_row;
format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter;
format_settings.tsv.empty_as_default = settings.input_format_defaults_for_omitted_fields;
return format_settings;
}

View File

@ -60,6 +60,13 @@ struct FormatSettings
Template template_settings;
/// Settings specific to the tab-separated input format.
struct TSV
{
/// When true, the TSV reader inserts the column's default value for an empty field
/// instead of parsing it. Populated from the `input_format_defaults_for_omitted_fields` setting.
bool empty_as_default = false;
};
TSV tsv;
bool skip_unknown_fields = false;
bool with_names_use_header = false;
bool write_statistics = true;

View File

@ -180,16 +180,11 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
{
const auto & column_index = column_indexes_for_input_fields[file_column];
const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
if (column_index)
{
const auto & type = data_types[*column_index];
if (format_settings.null_as_default && !type->isNullable())
ext.read_columns[*column_index] = DataTypeNullable::deserializeTextEscaped(*columns[*column_index], in, format_settings, type);
else
{
type->deserializeAsTextEscaped(*columns[*column_index], in, format_settings);
ext.read_columns[*column_index] = true;
}
ext.read_columns[*column_index] = readField(*columns[*column_index], type, is_last_file_column);
}
else
{
@ -216,6 +211,22 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens
return true;
}
/** Reads a single tab-separated field from `in` into `column`.
  *
  * @param column               destination column; exactly one value is appended to it.
  * @param type                 data type used to deserialize the field.
  * @param is_last_file_column  true when this field is the last one expected in the row,
  *                             so an empty field is terminated by '\n' or end of input rather than '\t'.
  * @return false when the field was empty and a default value was inserted instead of being parsed;
  *         otherwise true, or whatever DataTypeNullable::deserializeTextEscaped reports
  *         when `null_as_default` is applied to a non-nullable type.
  *         Callers store this value as the "column was actually read" flag.
  */
bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column)
{
    /// A field is empty when the read position already sits on its terminator:
    /// a tab for any field, or a line feed / end of input for the last field of the row.
    const bool at_delimiter = !in.eof() && *in.position() == '\t';
    const bool at_last_column_line_end = is_last_file_column && (in.eof() || *in.position() == '\n');

    if (format_settings.tsv.empty_as_default && (at_delimiter || at_last_column_line_end))
    {
        /// Nothing is consumed from the buffer here; the caller still sees the terminator.
        column.insertDefault();
        return false;
    }

    /// TSV values use the "Escaped" text serialization, the same one used by the code
    /// this helper replaces; the CSV deserializers must not be used for this format.
    if (format_settings.null_as_default && !type->isNullable())
        return DataTypeNullable::deserializeTextEscaped(column, in, format_settings, type);

    type->deserializeAsTextEscaped(column, in, format_settings);
    return true;
}
bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
{
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
@ -314,10 +325,8 @@ void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, I
prev_pos = in.position();
if (column_indexes_for_input_fields[file_column])
{
if (format_settings.null_as_default && !type->isNullable())
DataTypeNullable::deserializeTextEscaped(column, in, format_settings, type);
else
type->deserializeAsTextEscaped(column, in, format_settings);
const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
readField(column, type, is_last_file_column);
}
else
{

View File

@ -41,6 +41,8 @@ private:
std::vector<UInt8> read_columns;
std::vector<size_t> columns_to_fill_with_default_values;
bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column);
void addInputColumn(const String & column_name);
void setupAllColumnsByTableSchema();
void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext);

View File

@ -227,7 +227,7 @@ Enabled by default.
## input_format_defaults_for_omitted_fields {#session_settings-input_format_defaults_for_omitted_fields}
When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md#jsoneachrow) and [CSV](../../interfaces/formats.md#csv) formats.
When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md#jsoneachrow), [CSV](../../interfaces/formats.md#csv) and [TabSeparated](../../interfaces/formats.md#tabseparated) formats.
!!! note "Note"
When this option is enabled, extended table metadata are sent from server to client. It consumes additional computing resources on the server and can reduce performance.

View File

@ -207,6 +207,7 @@ Ok.
- [JSONEachRow](../../interfaces/formats.md#jsoneachrow)
- [CSV](../../interfaces/formats.md#csv)
- [TabSeparated](../../interfaces/formats.md#tabseparated)
!!! note "Примечание"
Когда опция включена, сервер отправляет клиенту расширенные метаданные. Это требует дополнительных вычислительных ресурсов на сервере и может снизить производительность.