mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 16:50:48 +00:00
Merge pull request #15685 from vivarum/enable-parsing-of-input-enum-values-by-id-10682
Enable parsing enum values by their ids for CSV, TSV and JSON input formats
This commit is contained in:
commit
8998829f66
@ -305,6 +305,10 @@ When enabled, replace empty input fields in TSV with default values. For complex
|
||||
|
||||
Disabled by default.
|
||||
|
||||
## input_format_tsv_enum_as_number {#settings-input_format_tsv_enum_as_number}
|
||||
|
||||
For TSV input format switches to parsing enum values as enum ids.
|
||||
|
||||
## input_format_null_as_default {#settings-input-format-null-as-default}
|
||||
|
||||
Enables or disables using default values if input data contain `NULL`, but the data type of the corresponding column in not `Nullable(T)` (for text input formats).
|
||||
@ -1161,6 +1165,10 @@ The character is interpreted as a delimiter in the CSV data. By default, the del
|
||||
|
||||
For CSV input format enables or disables parsing of unquoted `NULL` as literal (synonym for `\N`).
|
||||
|
||||
## input_format_csv_enum_as_number {#settings-input_format_csv_enum_as_number}
|
||||
|
||||
For CSV input format switches to parsing enum values as enum ids.
|
||||
|
||||
## output_format_csv_crlf_end_of_line {#settings-output-format-csv-crlf-end-of-line}
|
||||
|
||||
Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF).
|
||||
|
@ -411,12 +411,14 @@ class IColumn;
|
||||
M(Bool, format_csv_allow_double_quotes, 1, "If it is set to true, allow strings in double quotes.", 0) \
|
||||
M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \
|
||||
M(Bool, input_format_csv_unquoted_null_literal_as_null, false, "Consider unquoted NULL literal as \\N", 0) \
|
||||
M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices \\N", 0) \
|
||||
M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).", 0) \
|
||||
M(Bool, input_format_with_names_use_header, true, "For TSVWithNames and CSVWithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.", 0) \
|
||||
M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \
|
||||
M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \
|
||||
M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, CSV and TSV formats).", IMPORTANT) \
|
||||
M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \
|
||||
M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \
|
||||
M(Bool, input_format_null_as_default, false, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \
|
||||
\
|
||||
M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \
|
||||
|
@ -146,12 +146,17 @@ void DataTypeEnum<Type>::serializeTextEscaped(const IColumn & column, size_t row
|
||||
}
|
||||
|
||||
template <typename Type>
|
||||
void DataTypeEnum<Type>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
|
||||
void DataTypeEnum<Type>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
|
||||
{
|
||||
/// NOTE It would be nice to do without creating a temporary object - at least extract std::string out.
|
||||
std::string field_name;
|
||||
readEscapedString(field_name, istr);
|
||||
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
|
||||
if (settings.tsv.input_format_enum_as_number)
|
||||
assert_cast<ColumnType &>(column).getData().push_back(readValue(istr));
|
||||
else
|
||||
{
|
||||
/// NOTE It would be nice to do without creating a temporary object - at least extract std::string out.
|
||||
std::string field_name;
|
||||
readEscapedString(field_name, istr);
|
||||
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Type>
|
||||
@ -169,11 +174,16 @@ void DataTypeEnum<Type>::deserializeTextQuoted(IColumn & column, ReadBuffer & is
|
||||
}
|
||||
|
||||
template <typename Type>
|
||||
void DataTypeEnum<Type>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
|
||||
void DataTypeEnum<Type>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
|
||||
{
|
||||
std::string field_name;
|
||||
readString(field_name, istr);
|
||||
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
|
||||
if (settings.tsv.input_format_enum_as_number)
|
||||
assert_cast<ColumnType &>(column).getData().push_back(readValue(istr));
|
||||
else
|
||||
{
|
||||
std::string field_name;
|
||||
readString(field_name, istr);
|
||||
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Type>
|
||||
@ -191,9 +201,14 @@ void DataTypeEnum<Type>::serializeTextXML(const IColumn & column, size_t row_num
|
||||
template <typename Type>
|
||||
void DataTypeEnum<Type>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
|
||||
{
|
||||
std::string field_name;
|
||||
readJSONString(field_name, istr);
|
||||
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
|
||||
if (!istr.eof() && *istr.position() != '"')
|
||||
assert_cast<ColumnType &>(column).getData().push_back(readValue(istr));
|
||||
else
|
||||
{
|
||||
std::string field_name;
|
||||
readJSONString(field_name, istr);
|
||||
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Type>
|
||||
@ -205,9 +220,14 @@ void DataTypeEnum<Type>::serializeTextCSV(const IColumn & column, size_t row_num
|
||||
template <typename Type>
|
||||
void DataTypeEnum<Type>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
|
||||
{
|
||||
std::string field_name;
|
||||
readCSVString(field_name, istr, settings.csv);
|
||||
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
|
||||
if (settings.csv.input_format_enum_as_number)
|
||||
assert_cast<ColumnType &>(column).getData().push_back(readValue(istr));
|
||||
else
|
||||
{
|
||||
std::string field_name;
|
||||
readCSVString(field_name, istr, settings.csv);
|
||||
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Type>
|
||||
|
@ -66,13 +66,18 @@ public:
|
||||
|
||||
TypeIndex getTypeId() const override { return sizeof(FieldType) == 1 ? TypeIndex::Enum8 : TypeIndex::Enum16; }
|
||||
|
||||
const StringRef & getNameForValue(const FieldType & value) const
|
||||
auto findByValue(const FieldType & value) const
|
||||
{
|
||||
const auto it = value_to_name_map.find(value);
|
||||
if (it == std::end(value_to_name_map))
|
||||
throw Exception{"Unexpected value " + toString(value) + " for type " + getName(), ErrorCodes::BAD_ARGUMENTS};
|
||||
|
||||
return it->second;
|
||||
return it;
|
||||
}
|
||||
|
||||
const StringRef & getNameForValue(const FieldType & value) const
|
||||
{
|
||||
return findByValue(value)->second;
|
||||
}
|
||||
|
||||
FieldType getValue(StringRef field_name) const
|
||||
@ -84,6 +89,13 @@ public:
|
||||
return it->getMapped();
|
||||
}
|
||||
|
||||
FieldType readValue(ReadBuffer & istr) const
|
||||
{
|
||||
FieldType x;
|
||||
readText(x, istr);
|
||||
return findByValue(x)->first;
|
||||
}
|
||||
|
||||
Field castToName(const Field & value_or_name) const override;
|
||||
Field castToValue(const Field & value_or_name) const override;
|
||||
|
||||
|
@ -49,6 +49,7 @@ static FormatSettings getInputFormatSetting(const Settings & settings, const Con
|
||||
format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes;
|
||||
format_settings.csv.unquoted_null_literal_as_null = settings.input_format_csv_unquoted_null_literal_as_null;
|
||||
format_settings.csv.empty_as_default = settings.input_format_defaults_for_omitted_fields;
|
||||
format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number;
|
||||
format_settings.null_as_default = settings.input_format_null_as_default;
|
||||
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
|
||||
format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
|
||||
@ -63,6 +64,7 @@ static FormatSettings getInputFormatSetting(const Settings & settings, const Con
|
||||
format_settings.template_settings.row_format = settings.format_template_row;
|
||||
format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter;
|
||||
format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default;
|
||||
format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number;
|
||||
format_settings.schema.format_schema = settings.format_schema;
|
||||
format_settings.schema.format_schema_path = context.getFormatSchemaPath();
|
||||
format_settings.schema.is_server = context.hasGlobalContext() && (context.getGlobalContext().getApplicationType() == Context::ApplicationType::SERVER);
|
||||
|
@ -34,6 +34,7 @@ struct FormatSettings
|
||||
bool unquoted_null_literal_as_null = false;
|
||||
bool empty_as_default = false;
|
||||
bool crlf_end_of_line = false;
|
||||
bool input_format_enum_as_number = false;
|
||||
};
|
||||
|
||||
CSV csv;
|
||||
@ -81,6 +82,7 @@ struct FormatSettings
|
||||
bool empty_as_default = false;
|
||||
bool crlf_end_of_line = false;
|
||||
String null_representation = "\\N";
|
||||
bool input_format_enum_as_number = false;
|
||||
};
|
||||
|
||||
TSV tsv;
|
||||
|
@ -0,0 +1 @@
|
||||
102 es
|
@ -0,0 +1,15 @@
|
||||
DROP TABLE IF EXISTS table_with_enum_column_for_csv_insert;
|
||||
|
||||
CREATE TABLE table_with_enum_column_for_csv_insert (
|
||||
Id Int32,
|
||||
Value Enum('ef' = 1, 'es' = 2)
|
||||
) ENGINE=Memory();
|
||||
|
||||
SET input_format_csv_enum_as_number = 1;
|
||||
|
||||
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2
|
||||
SELECT * FROM table_with_enum_column_for_csv_insert;
|
||||
|
||||
SET input_format_csv_enum_as_number = 0;
|
||||
|
||||
DROP TABLE IF EXISTS table_with_enum_column_for_csv_insert;
|
@ -0,0 +1 @@
|
||||
102 es
|
@ -0,0 +1,11 @@
|
||||
DROP TABLE IF EXISTS table_with_enum_column_for_json_insert;
|
||||
|
||||
CREATE TABLE table_with_enum_column_for_json_insert (
|
||||
Id Int32,
|
||||
Value Enum('ef' = 1, 'es' = 2)
|
||||
) ENGINE=Memory();
|
||||
|
||||
INSERT INTO table_with_enum_column_for_json_insert FORMAT JSONEachRow {"Id":102,"Value":2}
|
||||
SELECT * FROM table_with_enum_column_for_json_insert;
|
||||
|
||||
DROP TABLE IF EXISTS table_with_enum_column_for_json_insert;
|
@ -0,0 +1,2 @@
|
||||
102 es
|
||||
103 ef
|
@ -0,0 +1,16 @@
|
||||
DROP TABLE IF EXISTS table_with_enum_column_for_tsv_insert;
|
||||
|
||||
CREATE TABLE table_with_enum_column_for_tsv_insert (
|
||||
Id Int32,
|
||||
Value Enum('ef' = 1, 'es' = 2)
|
||||
) ENGINE=Memory();
|
||||
|
||||
SET input_format_tsv_enum_as_number = 1;
|
||||
|
||||
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2
|
||||
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TabSeparatedRaw 103 1
|
||||
SELECT * FROM table_with_enum_column_for_tsv_insert ORDER BY Id;
|
||||
|
||||
SET input_format_tsv_enum_as_number = 0;
|
||||
|
||||
DROP TABLE IF EXISTS table_with_enum_column_for_tsv_insert;
|
Loading…
Reference in New Issue
Block a user