Merge pull request #18271 from ClickHouse/arrays-as-nested-csv

Support to parse Arrays in CSV as nested CSV in a String
This commit is contained in:
alexey-milovidov 2020-12-22 09:28:25 +03:00 committed by GitHub
commit adab35003e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 75 additions and 10 deletions

View File

@ -414,7 +414,9 @@ class IColumn;
M(UInt64, multiple_joins_rewriter_version, 0, "Obsolete setting, does nothing. Will be removed after 2021-03-31", 0) \
M(Bool, enable_debug_queries, false, "Enabled debug queries, but now is obsolete", 0) \
M(Bool, allow_experimental_database_atomic, true, "Obsolete setting, does nothing. Will be removed after 2021-02-12", 0) \
M(UnionMode, union_default_mode, UnionMode::DISTINCT, "Set default Union Mode in SelectWithUnion query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without Union Mode will throw exception.", 0)
M(UnionMode, union_default_mode, UnionMode::DISTINCT, "Set default Union Mode in SelectWithUnion query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without Union Mode will throw exception.", 0) \
M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \
M(Bool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \
// End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS below.
@ -426,10 +428,10 @@ class IColumn;
M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \
M(Bool, input_format_csv_unquoted_null_literal_as_null, false, "Consider unquoted NULL literal as \\N", 0) \
M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices \\N", 0) \
M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". Braces around array can be omitted.)", 0) \
M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).", 0) \
M(Bool, input_format_with_names_use_header, true, "For TSVWithNames and CSVWithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.", 0) \
M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \
M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \
M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, CSV and TSV formats).", IMPORTANT) \
M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \
M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \
@ -438,7 +440,6 @@ class IColumn;
M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \
M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \
\
M(Bool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \
M(Bool, input_format_values_interpret_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression.", 0) \
M(Bool, input_format_values_deduce_templates_of_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser, deduce template of the SQL expression, try to parse all rows using template and then interpret expression for all rows.", 0) \
M(Bool, input_format_values_accurate_types_of_literals, true, "For Values format: when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues.", 0) \

View File

@ -300,7 +300,7 @@ static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffe
template <typename Reader>
static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested)
static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed)
{
ColumnArray & column_array = assert_cast<ColumnArray &>(column);
ColumnArray::Offsets & offsets = column_array.getOffsets();
@ -308,7 +308,12 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r
IColumn & nested_column = column_array.getData();
size_t size = 0;
assertChar('[', istr);
bool has_braces = false;
if (checkChar('[', istr))
has_braces = true;
else if (!allow_unenclosed)
throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character");
try
{
@ -320,7 +325,9 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r
if (*istr.position() == ',')
++istr.position();
else
throw Exception("Cannot read array from text", ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT);
throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT,
"Cannot read array from text, expected comma or end of array, found '{}'",
*istr.position());
}
first = false;
@ -335,7 +342,11 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r
skipWhitespaceIfAny(istr);
}
assertChar(']', istr);
if (has_braces)
assertChar(']', istr);
else /// If array is not enclosed in braces, we read until EOF.
assertEOF(istr);
}
catch (...)
{
@ -364,7 +375,7 @@ void DataTypeArray::deserializeText(IColumn & column, ReadBuffer & istr, const F
[&](IColumn & nested_column)
{
nested->deserializeAsTextQuoted(nested_column, istr, settings);
});
}, false);
}
void DataTypeArray::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
@ -390,7 +401,11 @@ void DataTypeArray::serializeTextJSON(const IColumn & column, size_t row_num, Wr
void DataTypeArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
deserializeTextImpl(column, istr, [&](IColumn & nested_column) { nested->deserializeAsTextJSON(nested_column, istr, settings); });
deserializeTextImpl(column, istr,
[&](IColumn & nested_column)
{
nested->deserializeAsTextJSON(nested_column, istr, settings);
}, false);
}
@ -429,7 +444,23 @@ void DataTypeArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, cons
String s;
readCSV(s, istr, settings.csv);
ReadBufferFromString rb(s);
deserializeText(column, rb, settings);
if (settings.csv.input_format_arrays_as_nested_csv)
{
deserializeTextImpl(column, rb,
[&](IColumn & nested_column)
{
nested->deserializeAsTextCSV(nested_column, rb, settings);
}, true);
}
else
{
deserializeTextImpl(column, rb,
[&](IColumn & nested_column)
{
nested->deserializeAsTextQuoted(nested_column, rb, settings);
}, true);
}
}

View File

@ -64,6 +64,7 @@ FormatSettings getFormatSettings(const Context & context,
format_settings.csv.empty_as_default = settings.input_format_defaults_for_omitted_fields;
format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number;
format_settings.csv.unquoted_null_literal_as_null = settings.input_format_csv_unquoted_null_literal_as_null;
format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
format_settings.custom.escaping_rule = settings.format_custom_escaping_rule;
format_settings.custom.field_delimiter = settings.format_custom_field_delimiter;
format_settings.custom.result_after_delimiter = settings.format_custom_result_after_delimiter;

View File

@ -71,6 +71,7 @@ struct FormatSettings
bool empty_as_default = false;
bool crlf_end_of_line = false;
bool input_format_enum_as_number = false;
bool input_format_arrays_as_nested_csv = false;
} csv;
struct Custom

View File

@ -0,0 +1,4 @@
['Hello','world','42" TV']
['Hello','world','42" TV']
['Hello','world','42" TV']
['Hello','world','42" TV']

View File

@ -0,0 +1,27 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. "$CURDIR"/../shell_config.sh
${CLICKHOUSE_CLIENT} --multiquery --query "
DROP TABLE IF EXISTS test;
CREATE TABLE test (a Array(String)) ENGINE = Memory;
"
${CLICKHOUSE_CLIENT} --query "INSERT INTO test FORMAT CSV" <<END
"['Hello', 'world', '42"" TV']"
END
${CLICKHOUSE_CLIENT} --format_csv_allow_single_quotes 0 --query "INSERT INTO test FORMAT CSV" <<END
"'Hello', 'world', '42"" TV'"
END
${CLICKHOUSE_CLIENT} --input_format_csv_arrays_as_nested_csv 1 --query "INSERT INTO test FORMAT CSV" <<END
"[""Hello"", ""world"", ""42"""" TV""]"
"""Hello"", ""world"", ""42"""" TV"""
END
${CLICKHOUSE_CLIENT} --multiquery --query "
SELECT * FROM test;
DROP TABLE IF EXISTS test;
"