This commit is contained in:
Nikita Mikhaylov 2024-06-12 13:42:28 +00:00
parent 594a0e9a1a
commit 04897eb0fa
11 changed files with 250 additions and 162 deletions

View File

@ -1009,6 +1009,8 @@ class IColumn;
M(Char, format_csv_delimiter, ',', "The character to be considered as a delimiter in CSV data. If setting with a string, a string has to have a length of 1.", 0) \
M(Bool, format_csv_allow_single_quotes, false, "If it is set to true, allow strings in single quotes.", 0) \
M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \
M(Bool, format_csv_serialize_tuple_into_separate_columns, true, "If it is set to true, then Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost)", 0) \
M(Bool, format_csv_deserialize_separate_columns_into_tuple, true, "If it is set to true, then separate columns written in CSV format can be deserialized to Tuple column.", 0) \
M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \
M(Bool, input_format_csv_allow_cr_end_of_line, false, "If it is set true, \\r will be allowed at end of line not followed by \\n", 0) \
M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices", 0) \
@ -1047,6 +1049,7 @@ class IColumn;
M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, "The maximum bytes of data to read for automatic schema inference", 0) \
M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
M(Bool, input_format_csv_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference in CSV format", 0) \
M(Bool, input_format_csv_try_infer_strings_from_quoted_tuples, true, "Interpret quoted tuples in the input data as a value of type String.", 0) \
M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \

View File

@ -172,6 +172,9 @@ static const std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges
{"azure_max_upload_part_size", 5ull*1024*1024*1024, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to Azure blob storage."},
{"azure_upload_part_size_multiply_factor", 2, 2, "Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage."},
{"azure_upload_part_size_multiply_parts_count_threshold", 500, 500, "Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor."},
{"format_csv_serialize_tuple_into_separate_columns", true, true, "A new way of interpreting tuples in CSV format was added."},
{"format_csv_deserialize_separate_columns_into_tuple", true, true, "A new way of interpreting tuples in CSV format was added."},
{"input_format_csv_try_infer_strings_from_quoted_tuples", true, true, "A new way of interpreting tuples in CSV format was added."}
}},
{"24.2", {{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"},
{"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"},

View File

@ -531,26 +531,98 @@ void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num
/// Serializes one tuple row in CSV format.
/// If csv.serialize_tuple_into_separate_columns is enabled, each tuple element
/// is written as its own CSV field separated by csv.tuple_delimiter (the tuple
/// nesting is flattened). Otherwise the whole tuple is rendered as text and
/// written as a single quoted CSV field (legacy behaviour).
/// NOTE(review): the diff-merged original kept the old unconditional
/// serialization lines before the new branching code, which would emit the row
/// twice; only the intended new implementation is kept here.
void SerializationTuple::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    if (settings.csv.serialize_tuple_into_separate_columns)
    {
        for (size_t i = 0; i < elems.size(); ++i)
        {
            if (i != 0)
                writeChar(settings.csv.tuple_delimiter, ostr);
            elems[i]->serializeTextCSV(extractElementColumn(column, i), row_num, ostr, settings);
        }
    }
    else
    {
        /// Legacy path: render the tuple with the generic text serialization,
        /// then CSV-quote the resulting string as one field.
        WriteBufferFromOwnString wb;
        serializeText(column, row_num, wb, settings);
        writeCSV(wb.str(), ostr);
    }
}
/// Deserializes one tuple row from CSV.
/// If csv.deserialize_separate_columns_into_tuple is enabled, each tuple
/// element is read from its own CSV field separated by csv.tuple_delimiter;
/// addElementSafe rolls back partially inserted elements on failure.
/// Otherwise the whole tuple is expected in a single quoted CSV field and is
/// parsed with the generic text deserialization (legacy behaviour).
/// Throws on malformed input (e.g. via assertChar on a missing delimiter).
/// NOTE(review): the diff-merged original kept the old field-consuming lines
/// before the new branching code, so istr would be read twice; only the
/// intended new implementation is kept here.
void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    if (settings.csv.deserialize_separate_columns_into_tuple)
    {
        addElementSafe<void>(elems.size(), column, [&]
        {
            const size_t size = elems.size();
            for (size_t i = 0; i < size; ++i)
            {
                if (i != 0)
                {
                    /// Elements are separated by the tuple delimiter, with
                    /// optional surrounding whitespace.
                    skipWhitespaceIfAny(istr);
                    assertChar(settings.csv.tuple_delimiter, istr);
                    skipWhitespaceIfAny(istr);
                }

                auto & element_column = extractElementColumn(column, i);
                /// Honour null_as_default for element types that cannot hold NULL.
                if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column))
                    SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(element_column, istr, settings, elems[i]);
                else
                    elems[i]->deserializeTextCSV(element_column, istr, settings);
            }
            return true;
        });
    }
    else
    {
        /// Legacy path: the whole tuple sits in one CSV field; unquote it,
        /// then parse with the generic text deserialization.
        String s;
        readCSV(s, istr, settings.csv);
        ReadBufferFromString rb(s);
        deserializeText(column, rb, settings, true);
    }
}
/// Non-throwing counterpart of deserializeTextCSV: returns false instead of
/// throwing when the input does not parse (checkChar instead of assertChar,
/// try* element deserialization). addElementSafe rolls back partial inserts
/// when the lambda reports failure.
/// NOTE(review): the diff-merged original kept the old implementation's lines
/// ending in an unconditional return before the new branching code, making the
/// new code unreachable; only the intended new implementation is kept here.
bool SerializationTuple::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    if (settings.csv.deserialize_separate_columns_into_tuple)
    {
        return addElementSafe<bool>(elems.size(), column, [&]
        {
            const size_t size = elems.size();
            for (size_t i = 0; i < size; ++i)
            {
                if (i != 0)
                {
                    skipWhitespaceIfAny(istr);
                    if (!checkChar(settings.csv.tuple_delimiter, istr))
                        return false;
                    skipWhitespaceIfAny(istr);
                }

                auto & element_column = extractElementColumn(column, i);
                /// Honour null_as_default for element types that cannot hold NULL.
                if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column))
                {
                    if (!SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextCSV(element_column, istr, settings, elems[i]))
                        return false;
                }
                else
                {
                    if (!elems[i]->tryDeserializeTextCSV(element_column, istr, settings))
                        return false;
                }
            }
            return true;
        });
    }
    else
    {
        /// Legacy path: whole tuple in one CSV field.
        String s;
        if (!tryReadCSV(s, istr, settings.csv))
            return false;
        ReadBufferFromString rb(s);
        return tryDeserializeText(column, rb, settings, true);
    }
}
struct SerializeBinaryBulkStateTuple : public ISerialization::SerializeBinaryBulkState

View File

@ -303,7 +303,7 @@ DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSet
auto type = tryInferDataTypeForSingleField(data, format_settings);
/// If we couldn't infer any type or it's a number and csv.try_infer_numbers_from_strings = 0, we determine it as a string.
if (!type || (isNumber(type) && !format_settings.csv.try_infer_numbers_from_strings))
if (!type || (isTuple(type) && format_settings.csv.try_infer_strings_from_quoted_tuples) || (isNumber(type) && !format_settings.csv.try_infer_numbers_from_strings))
return std::make_shared<DataTypeString>();
return type;

View File

@ -77,6 +77,8 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.avro.output_rows_in_file = settings.output_format_avro_rows_in_file;
format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes;
format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes;
format_settings.csv.serialize_tuple_into_separate_columns = settings.format_csv_serialize_tuple_into_separate_columns;
format_settings.csv.deserialize_separate_columns_into_tuple = settings.format_csv_deserialize_separate_columns_into_tuple;
format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line;
format_settings.csv.allow_cr_end_of_line = settings.input_format_csv_allow_cr_end_of_line;
format_settings.csv.delimiter = settings.format_csv_delimiter;
@ -94,6 +96,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns;
format_settings.csv.use_default_on_bad_values = settings.input_format_csv_use_default_on_bad_values;
format_settings.csv.try_infer_numbers_from_strings = settings.input_format_csv_try_infer_numbers_from_strings;
format_settings.csv.try_infer_strings_from_quoted_tuples = settings.input_format_csv_try_infer_strings_from_quoted_tuples;
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;

View File

@ -153,6 +153,8 @@ struct FormatSettings
char delimiter = ',';
bool allow_single_quotes = true;
bool allow_double_quotes = true;
bool serialize_tuple_into_separate_columns = true;
bool deserialize_separate_columns_into_tuple = true;
bool empty_as_default = false;
bool crlf_end_of_line = false;
bool allow_cr_end_of_line = false;

View File

@ -1,11 +1,11 @@
"Hello, ""World""",123,"[1,2,3]","(456,['abc','def'])","Newline
"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline
here"
"x","y","z","a","b"
"Hello, ""World""",123,"[1,2,3]","(456,['abc','def'])","Newline
"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline
here"
"x","y","z","a","b"
"String","UInt8","Array(UInt8)","Tuple(UInt16, Array(String))","String"
"Hello, ""World""",123,"[1,2,3]","(456,['abc','def'])","Newline
"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline
here"
0,"0","[]","2000-01-01","2000-01-01 00:00:00"
1,"1","[0]","2000-01-02","2000-01-01 00:00:01"

View File

@ -11,8 +11,8 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE default_by_other_column (a Float32 DEFA
echo 'CSV'
echo '\N, 1, \N, "2019-07-22", "[10, 20, 30]", \N
1, world, 3, "2019-07-23", \N, "('\''tuple'\'', 3.14)"
2, \N, 123, \N, "[]", "('\''test'\'', 2.71828)"
1, world, 3, "2019-07-23", \N, tuple, 3.14
2, \N, 123, \N, "[]", test, 2.71828
3, \N, \N, \N, \N, \N' | $CLICKHOUSE_CLIENT --input_format_null_as_default=1 --query="INSERT INTO null_as_default FORMAT CSV";
$CLICKHOUSE_CLIENT --query="SELECT * FROM null_as_default ORDER BY i";
$CLICKHOUSE_CLIENT --query="TRUNCATE TABLE null_as_default";

View File

@ -1,107 +1,107 @@
TSV
c1 Nullable(Int64)
c2 Nullable(String)
c3 Array(Nullable(Int64))
c4 Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))
c1 Nullable(Int64)
c2 Nullable(String)
c3 Array(Nullable(Int64))
c4 Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))
42 Some string [1,2,3,4] (1,2,3)
42 abcd [] (4,5,6)
c1 Nullable(String)
c1 Nullable(String)
[({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)]
[]
[({}, [], 0)]
[({}, [NULL], NULL)]
[({}, [\'String3\'], NULL)]
[({\'key3\': NULL}, []), NULL]
c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64)))
c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64)))
[({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)]
[]
[({},[],0)]
[({},[NULL],NULL)]
[({},['String3'],NULL)]
[({'key3':NULL},[],NULL)]
c1 Nullable(Bool)
c1 Nullable(Bool)
true
false
\N
c1 Array(Nullable(Bool))
c1 Array(Nullable(Bool))
[true,NULL]
[]
[NULL]
[false]
c1 Nullable(String)
c1 Nullable(String)
[]
c1 Nullable(String)
c1 Nullable(String)
{}
c1 Nullable(String)
c1 Nullable(String)
()
c1 Nullable(String)
c1 Nullable(String)
[1, 2, 3
c1 Nullable(String)
c1 Nullable(String)
[(1, 2, 3 4)]
c1 Nullable(String)
c1 Nullable(String)
[1, 2, 3 + 4]
c1 Nullable(String)
c1 Nullable(String)
(1, 2,
c1 Nullable(String)
c1 Nullable(String)
[1, Some trash, 42.2]
c1 Nullable(String)
c1 Nullable(String)
[1, \'String\', {\'key\' : 2}]
c1 Nullable(String)
c1 Nullable(String)
{\'key\' : 1, [1] : 10}
c1 Nullable(String)
c1 Nullable(String)
{}{}
c1 Nullable(String)
c1 Nullable(String)
[1, 2, 3
c1 Nullable(String)
c1 Nullable(String)
[abc, def]
c1 Array(Nullable(String))
c1 Array(Nullable(String))
['abc','def']
c1 Nullable(String)
c1 Nullable(String)
[\'string]
c1 Nullable(String)
c1 Nullable(String)
\'string
c1 Nullable(Float64)
c1 Nullable(Float64)
42.42
c1 Nullable(String)
c1 Nullable(String)
42.42sometrash
c1 Nullable(String)
c1 Nullable(String)
[42.42sometrash, 42.42]
CSV
c1 Nullable(String)
c2 Nullable(String)
c3 Array(Nullable(Int64))
c4 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)))
c1 Nullable(String)
c2 Nullable(String)
c3 Array(Nullable(Int64))
c4 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)))
42 Some string [1,2,3,4] [(1,2,3)]
42\\ abcd [] [(4,5,6)]
c1 Nullable(String)
c1 Nullable(String)
[({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)]
[]
[({}, [], 0)]
[({}, [NULL], NULL)]
[({}, [\'String3\'], NULL)]
[({\'key3\': NULL}, []), NULL]
c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64)))
c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64)))
[({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)]
[]
[({},[],0)]
[({},[NULL],NULL)]
[({},['String3'],NULL)]
[({'key3':NULL},[],NULL)]
c1 Nullable(Bool)
c1 Nullable(Bool)
true
false
\N
c1 Array(Nullable(Bool))
c1 Array(Nullable(Bool))
[true,NULL]
[]
[NULL]
[false]
c1 Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))
(1,2,3)
c1 Nullable(String)
c1 Nullable(String)
(1, 2, 3)
c1 Nullable(String)
123.123
c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)))
c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)))
[(1,2,3)]
c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)))
c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)))
[(1,2,3)]

View File

@ -1,122 +1,123 @@
Parquet
a Nullable(UInt64)
b Nullable(String)
c Array(Nullable(UInt64))
d Tuple(\n a Nullable(UInt64),\n b Nullable(String))
a Nullable(UInt64)
b Nullable(String)
c Array(Nullable(UInt64))
d Tuple(\n a Nullable(UInt64),\n b Nullable(String))
ORC
a Nullable(Int64)
b Nullable(String)
c Array(Nullable(Int64))
d Tuple(\n a Nullable(Int64),\n b Nullable(String))
a Nullable(Int64)
b Nullable(String)
c Array(Nullable(Int64))
d Tuple(\n a Nullable(Int64),\n b Nullable(String))
Arrow
a Nullable(UInt64)
b Nullable(String)
c Array(Nullable(UInt64))
d Tuple(\n a Nullable(UInt64),\n b Nullable(String))
a Nullable(UInt64)
b Nullable(String)
c Array(Nullable(UInt64))
d Tuple(\n a Nullable(UInt64),\n b Nullable(String))
ArrowStream
a Nullable(UInt64)
b Nullable(String)
c Array(Nullable(UInt64))
d Tuple(\n a Nullable(UInt64),\n b Nullable(String))
a Nullable(UInt64)
b Nullable(String)
c Array(Nullable(UInt64))
d Tuple(\n a Nullable(UInt64),\n b Nullable(String))
Avro
a Int64
b String
c Array(Int64)
d Tuple(\n a Int64,\n b String)
a Int64
b String
c Array(Int64)
d Tuple(\n a Int64,\n b String)
Native
a UInt64
b String
c Array(UInt64)
d Tuple(\n a UInt64,\n b String)
a UInt64
b String
c Array(UInt64)
d Tuple(\n a UInt64,\n b String)
BSONEachRow
a Nullable(Int64)
b Nullable(String)
c Array(Nullable(Int64))
d Tuple(\n a Nullable(Int64),\n b Nullable(String))
a Nullable(Int64)
b Nullable(String)
c Array(Nullable(Int64))
d Tuple(\n a Nullable(Int64),\n b Nullable(String))
JSONCompact
a UInt64
b String
c Array(UInt64)
d Tuple(\n a UInt64,\n b String)
a UInt64
b String
c Array(UInt64)
d Tuple(\n a UInt64,\n b String)
Values
c1 Nullable(UInt64)
c2 Nullable(String)
c3 Array(Nullable(UInt64))
c4 Tuple(Nullable(UInt64), Nullable(String))
c1 Nullable(UInt64)
c2 Nullable(String)
c3 Array(Nullable(UInt64))
c4 Tuple(Nullable(UInt64), Nullable(String))
TSKV
a Nullable(String)
b Nullable(String)
c Array(Nullable(UInt64))
d Nullable(String)
a Nullable(String)
b Nullable(String)
c Array(Nullable(UInt64))
d Nullable(String)
JSONObjectEachRow
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
JSONColumns
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
JSONCompactColumns
c1 Nullable(String)
c2 Nullable(String)
c3 Array(Nullable(String))
c4 Tuple(\n a Nullable(String),\n b Nullable(String))
c1 Nullable(String)
c2 Nullable(String)
c3 Array(Nullable(String))
c4 Tuple(\n a Nullable(String),\n b Nullable(String))
JSONCompact
a UInt64
b String
c Array(UInt64)
d Tuple(\n a UInt64,\n b String)
a UInt64
b String
c Array(UInt64)
d Tuple(\n a UInt64,\n b String)
JSON
a UInt64
b String
c Array(UInt64)
d Tuple(\n a UInt64,\n b String)
a UInt64
b String
c Array(UInt64)
d Tuple(\n a UInt64,\n b String)
TSV
c1 Nullable(UInt64)
c2 Nullable(String)
c3 Array(Nullable(UInt64))
c4 Tuple(Nullable(UInt64), Nullable(String))
c1 Nullable(UInt64)
c2 Nullable(String)
c3 Array(Nullable(UInt64))
c4 Tuple(Nullable(UInt64), Nullable(String))
CSV
c1 Nullable(UInt64)
c2 Nullable(String)
c3 Array(Nullable(UInt64))
c4 Tuple(Nullable(UInt64), Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a UInt64
b String
c Array(UInt64)
d Tuple(\n a UInt64,\n b String)
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
c1 Nullable(UInt64)
c2 Nullable(String)
c3 Array(Nullable(UInt64))
c4 Nullable(UInt64)
c5 Nullable(String)
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a UInt64
b String
c Array(UInt64)
d Tuple(\n a UInt64,\n b String)
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
1
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))
a Nullable(String)
b Nullable(String)
c Array(Nullable(String))
d Tuple(\n a Nullable(String),\n b Nullable(String))

View File

@ -1,5 +1,9 @@
-- Tags: no-parallel
SET format_csv_serialize_tuple_into_separate_columns = false;
SET format_csv_deserialize_separate_columns_into_tuple = false;
SET input_format_csv_try_infer_strings_from_quoted_tuples = false;
insert into function file('02977_1.csv') select '20240305', 1, ['s', 'd'], map('a', 2), tuple('222', 33, map('abc', 5)) SETTINGS engine_file_truncate_on_insert=1;
desc file('02977_1.csv');
select * from file('02977_1.csv') settings max_threads=1;