Merge pull request #15685 from vivarum/enable-parsing-of-input-enum-values-by-id-10682

Enable parsing enum values by their ids for CSV, TSV and JSON input formats
This commit is contained in:
alexey-milovidov 2020-10-20 22:52:57 +03:00 committed by GitHub
commit 8998829f66
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 109 additions and 17 deletions

View File

@ -305,6 +305,10 @@ When enabled, replace empty input fields in TSV with default values. For complex
Disabled by default.
## input_format_tsv_enum_as_number {#settings-input_format_tsv_enum_as_number}
For TSV input format switches to parsing enum values as enum ids.
## input_format_null_as_default {#settings-input-format-null-as-default}
Enables or disables using default values if input data contain `NULL`, but the data type of the corresponding column in not `Nullable(T)` (for text input formats).
@ -1161,6 +1165,10 @@ The character is interpreted as a delimiter in the CSV data. By default, the del
For CSV input format enables or disables parsing of unquoted `NULL` as literal (synonym for `\N`).
## input_format_csv_enum_as_number {#settings-input_format_csv_enum_as_number}
For CSV input format switches to parsing enum values as enum ids.
## output_format_csv_crlf_end_of_line {#settings-output-format-csv-crlf-end-of-line}
Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF).

View File

@ -411,12 +411,14 @@ class IColumn;
M(Bool, format_csv_allow_double_quotes, 1, "If it is set to true, allow strings in double quotes.", 0) \
M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \
M(Bool, input_format_csv_unquoted_null_literal_as_null, false, "Consider unquoted NULL literal as \\N", 0) \
M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices \\N", 0) \
M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).", 0) \
M(Bool, input_format_with_names_use_header, true, "For TSVWithNames and CSVWithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.", 0) \
M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \
M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \
M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, CSV and TSV formats).", IMPORTANT) \
M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \
M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \
M(Bool, input_format_null_as_default, false, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \
\
M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \

View File

@ -146,12 +146,17 @@ void DataTypeEnum<Type>::serializeTextEscaped(const IColumn & column, size_t row
}
template <typename Type>
void DataTypeEnum<Type>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
void DataTypeEnum<Type>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
/// NOTE It would be nice to do without creating a temporary object - at least extract std::string out.
std::string field_name;
readEscapedString(field_name, istr);
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
if (settings.tsv.input_format_enum_as_number)
assert_cast<ColumnType &>(column).getData().push_back(readValue(istr));
else
{
/// NOTE It would be nice to do without creating a temporary object - at least extract std::string out.
std::string field_name;
readEscapedString(field_name, istr);
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
}
}
template <typename Type>
@ -169,11 +174,16 @@ void DataTypeEnum<Type>::deserializeTextQuoted(IColumn & column, ReadBuffer & is
}
template <typename Type>
void DataTypeEnum<Type>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
void DataTypeEnum<Type>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
std::string field_name;
readString(field_name, istr);
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
if (settings.tsv.input_format_enum_as_number)
assert_cast<ColumnType &>(column).getData().push_back(readValue(istr));
else
{
std::string field_name;
readString(field_name, istr);
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
}
}
template <typename Type>
@ -191,9 +201,14 @@ void DataTypeEnum<Type>::serializeTextXML(const IColumn & column, size_t row_num
template <typename Type>
void DataTypeEnum<Type>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
std::string field_name;
readJSONString(field_name, istr);
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
if (!istr.eof() && *istr.position() != '"')
assert_cast<ColumnType &>(column).getData().push_back(readValue(istr));
else
{
std::string field_name;
readJSONString(field_name, istr);
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
}
}
template <typename Type>
@ -205,9 +220,14 @@ void DataTypeEnum<Type>::serializeTextCSV(const IColumn & column, size_t row_num
template <typename Type>
void DataTypeEnum<Type>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
std::string field_name;
readCSVString(field_name, istr, settings.csv);
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
if (settings.csv.input_format_enum_as_number)
assert_cast<ColumnType &>(column).getData().push_back(readValue(istr));
else
{
std::string field_name;
readCSVString(field_name, istr, settings.csv);
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
}
}
template <typename Type>

View File

@ -66,13 +66,18 @@ public:
TypeIndex getTypeId() const override { return sizeof(FieldType) == 1 ? TypeIndex::Enum8 : TypeIndex::Enum16; }
const StringRef & getNameForValue(const FieldType & value) const
auto findByValue(const FieldType & value) const
{
const auto it = value_to_name_map.find(value);
if (it == std::end(value_to_name_map))
throw Exception{"Unexpected value " + toString(value) + " for type " + getName(), ErrorCodes::BAD_ARGUMENTS};
return it->second;
return it;
}
const StringRef & getNameForValue(const FieldType & value) const
{
return findByValue(value)->second;
}
FieldType getValue(StringRef field_name) const
@ -84,6 +89,13 @@ public:
return it->getMapped();
}
FieldType readValue(ReadBuffer & istr) const
{
FieldType x;
readText(x, istr);
return findByValue(x)->first;
}
Field castToName(const Field & value_or_name) const override;
Field castToValue(const Field & value_or_name) const override;

View File

@ -49,6 +49,7 @@ static FormatSettings getInputFormatSetting(const Settings & settings, const Con
format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes;
format_settings.csv.unquoted_null_literal_as_null = settings.input_format_csv_unquoted_null_literal_as_null;
format_settings.csv.empty_as_default = settings.input_format_defaults_for_omitted_fields;
format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number;
format_settings.null_as_default = settings.input_format_null_as_default;
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
@ -63,6 +64,7 @@ static FormatSettings getInputFormatSetting(const Settings & settings, const Con
format_settings.template_settings.row_format = settings.format_template_row;
format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter;
format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default;
format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number;
format_settings.schema.format_schema = settings.format_schema;
format_settings.schema.format_schema_path = context.getFormatSchemaPath();
format_settings.schema.is_server = context.hasGlobalContext() && (context.getGlobalContext().getApplicationType() == Context::ApplicationType::SERVER);

View File

@ -34,6 +34,7 @@ struct FormatSettings
bool unquoted_null_literal_as_null = false;
bool empty_as_default = false;
bool crlf_end_of_line = false;
bool input_format_enum_as_number = false;
};
CSV csv;
@ -81,6 +82,7 @@ struct FormatSettings
bool empty_as_default = false;
bool crlf_end_of_line = false;
String null_representation = "\\N";
bool input_format_enum_as_number = false;
};
TSV tsv;

View File

@ -0,0 +1,15 @@
DROP TABLE IF EXISTS table_with_enum_column_for_csv_insert;
CREATE TABLE table_with_enum_column_for_csv_insert (
Id Int32,
Value Enum('ef' = 1, 'es' = 2)
) ENGINE=Memory();
SET input_format_csv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2
SELECT * FROM table_with_enum_column_for_csv_insert;
SET input_format_csv_enum_as_number = 0;
DROP TABLE IF EXISTS table_with_enum_column_for_csv_insert;

View File

@ -0,0 +1,11 @@
DROP TABLE IF EXISTS table_with_enum_column_for_json_insert;
CREATE TABLE table_with_enum_column_for_json_insert (
Id Int32,
Value Enum('ef' = 1, 'es' = 2)
) ENGINE=Memory();
INSERT INTO table_with_enum_column_for_json_insert FORMAT JSONEachRow {"Id":102,"Value":2}
SELECT * FROM table_with_enum_column_for_json_insert;
DROP TABLE IF EXISTS table_with_enum_column_for_json_insert;

View File

@ -0,0 +1,2 @@
102 es
103 ef

View File

@ -0,0 +1,16 @@
DROP TABLE IF EXISTS table_with_enum_column_for_tsv_insert;
CREATE TABLE table_with_enum_column_for_tsv_insert (
Id Int32,
Value Enum('ef' = 1, 'es' = 2)
) ENGINE=Memory();
SET input_format_tsv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TabSeparatedRaw 103 1
SELECT * FROM table_with_enum_column_for_tsv_insert ORDER BY Id;
SET input_format_tsv_enum_as_number = 0;
DROP TABLE IF EXISTS table_with_enum_column_for_tsv_insert;