Incorporate review changes

This commit is contained in:
Blargian 2024-05-24 21:20:20 +02:00
parent dacbe1a427
commit 04800f596c
7 changed files with 67 additions and 33 deletions

View File

@ -1066,7 +1066,7 @@ class IColumn;
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \
M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \
M(Bool, input_format_json_infer_variant_from_multi_type_array, false, "Try to infer variant type rather than tuple when column/array has multiple", 0) \
M(Bool, input_format_try_infer_variants, false, "Try to infer the Variant type in text formats when there is more than one possible type for column/array elements", 0) \
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \

View File

@ -93,7 +93,7 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
{"prefer_external_sort_block_bytes", 0, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging."},
{"input_format_force_null_for_omitted_fields", false, false, "Disable type-defaults for omitted fields when needed"},
{"output_format_pretty_preserve_border_for_multiline_string", 1, 1, "Applies better rendering for multiline strings."},
{"input_format_json_infer_variant_from_multi_type_array", 0, 0, "Allows inference of variant type if columns/arrays have multiple types."},
{"input_format_try_infer_variants", 0, 0, "Try to infer Variant type in text formats when there is more than one possible type for column/array elements"},
}},
{"24.4", {{"input_format_json_throw_on_bad_escape_sequence", true, true, "Allow to save JSON strings with bad escape sequences"},
{"max_parsing_threads", 0, 0, "Add a separate setting to control number of threads in parallel parsing from files"},

View File

@ -137,7 +137,6 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.json.read_arrays_as_strings = settings.input_format_json_read_arrays_as_strings;
format_settings.json.try_infer_numbers_from_strings = settings.input_format_json_try_infer_numbers_from_strings;
format_settings.json.infer_incomplete_types_as_strings = settings.input_format_json_infer_incomplete_types_as_strings;
format_settings.json.infer_variant_from_multi_type_array = settings.input_format_json_infer_variant_from_multi_type_array;
format_settings.json.validate_types_from_metadata = settings.input_format_json_validate_types_from_metadata;
format_settings.json.validate_utf8 = settings.output_format_json_validate_utf8;
format_settings.json_object_each_row.column_for_object_name = settings.format_json_object_each_row_column_for_object_name;
@ -266,6 +265,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.max_parser_depth = context->getSettingsRef().max_parser_depth;
format_settings.client_protocol_version = context->getClientProtocolVersion();
format_settings.date_time_overflow_behavior = settings.date_time_overflow_behavior;
format_settings.try_infer_variant = settings.input_format_try_infer_variants;
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
if (format_settings.schema.is_server)

View File

@ -36,6 +36,7 @@ struct FormatSettings
bool decimal_trailing_zeros = false;
bool defaults_for_omitted_fields = true;
bool is_writing_to_terminal = false;
bool try_infer_variant = false;
bool seekable_read = true;
UInt64 max_rows_to_read_for_schema_inference = 25000;
@ -223,7 +224,6 @@ struct FormatSettings
bool compact_allow_variable_number_of_columns = false;
bool try_infer_objects_as_tuples = false;
bool infer_incomplete_types_as_strings = true;
bool infer_variant_from_multi_type_array = false;
bool throw_on_bad_escape_sequence = true;
bool ignore_unnecessary_fields = true;
} json{};

View File

@ -239,6 +239,16 @@ namespace
return true;
}
bool checkIfTypesContainVariant(const DataTypes & types)
{
for (size_t i = 0; i < types.size(); ++i)
{
if (isVariant(types[i]))
return true;
}
return false;
}
void updateTypeIndexes(DataTypes & data_types, TypeIndexesSet & type_indexes)
{
type_indexes.clear();
@ -308,20 +318,31 @@ namespace
type_indexes.erase(TypeIndex::UInt64);
}
/// if setting input_format_json_infer_variant_from_multi_type_array is true
/// if setting try_infer_variant is true
/// and nested types are not equal then we convert to type variant.
/// If the setting input_format_try_infer_variants is enabled and the nested
/// types are not all equal, replace every element type with a single Variant
/// type covering all of them.
/// NOTE(review): the diff rendering interleaved the old and new bodies here;
/// this is the reconstructed post-change implementation.
void transformVariant(DataTypes & data_types, TypeIndexesSet & type_indexes)
{
    /// Nothing to do when the types already agree, or when a Variant is
    /// already present (nesting a Variant inside a Variant is redundant).
    auto types_are_equal = checkIfTypesAreEqual(data_types);
    auto types_contain_variant = checkIfTypesContainVariant(data_types);
    if (types_are_equal || types_contain_variant)
        return;

    auto variant_type = std::make_shared<DataTypeVariant>(data_types);

    /// Replace each separate type with the single unified Variant type.
    DataTypes new_data_types(data_types.size(), variant_type);
    TypeIndexesSet new_type_indexes;
    new_type_indexes.insert(TypeIndex::Variant);

    data_types = std::move(new_data_types);
    type_indexes = std::move(new_type_indexes);
}
/// If we have only Date and DateTime types, convert Date to DateTime,
@ -661,16 +682,14 @@ namespace
if (settings.try_infer_dates || settings.try_infer_datetimes)
transformDatesAndDateTimes(data_types, type_indexes);
if (settings.try_infer_variant)
transformVariant(data_types, type_indexes);
if constexpr (!is_json)
return;
/// Check settings specific for JSON formats.
if (settings.json.infer_variant_from_multi_type_array)
{
transformVariant(data_types, type_indexes);
}
/// Convert numbers inferred from strings back to strings if needed.
if (settings.json.try_infer_numbers_from_strings || settings.json.read_numbers_as_strings)
transformJSONNumbersBackToString(data_types, settings, type_indexes, json_info);
@ -685,6 +704,10 @@ namespace
if (settings.json.try_infer_objects_as_tuples)
mergeJSONPaths(data_types, type_indexes, settings, json_info);
if (settings.try_infer_variant)
transformVariant(data_types, type_indexes);
};
auto transform_complex_types = [&](DataTypes & data_types, TypeIndexesSet & type_indexes)
@ -696,14 +719,12 @@ namespace
/// If there is at least one non Nothing type, change all Nothing types to it.
transformNothingComplexTypes(data_types, type_indexes);
if (settings.try_infer_variant)
transformVariant(data_types, type_indexes);
if constexpr (!is_json)
return;
if (settings.json.infer_variant_from_multi_type_array)
{
transformVariant(data_types, type_indexes);
}
/// Convert JSON tuples with same nested types to arrays.
transformTuplesWithEqualNestedTypesToArrays(data_types, type_indexes);
@ -715,6 +736,9 @@ namespace
if (json_info && json_info->allow_merging_named_tuples)
mergeNamedTuples(data_types, type_indexes, settings, json_info);
if (settings.try_infer_variant)
transformVariant(data_types, type_indexes);
};
transformTypesRecursively(types, transform_simple_types, transform_complex_types);

View File

@ -2,7 +2,7 @@
┃ arr ┃ toTypeName(arr) ┃
┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
1. │ [1,'Hello',(32)] │ Array(Variant(Int64, String, Tuple(…│
│ │… a Int64)))
│ │… a Nullable(Int64))))
└──────────────────┴─────────────────────────────────────┘
┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ x ┃ toTypeName(x) ┃
@ -11,12 +11,21 @@
├───────┼────────────────────────┤
2. │ Hello │ Variant(Int64, String) │
└───────┴────────────────────────┘
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ x ┃ toTypeName(x) ┃
┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
1. │ [1,2,3] │ Variant(Array(Int64), Tuple(…│
│ │… a Int64)) │
├─────────┼──────────────────────────────┤
2. │ (42) │ Variant(Array(Int64), Tuple(…│
│ │… a Int64)) │
└─────────┴──────────────────────────────┘
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ x ┃ toTypeName(x) ┃
┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
1. │ [1,2,3] │ Variant(Array(Nullable(Int64)), Tuple(…│
│ │… a Nullable(Int64))) │
├─────────┼────────────────────────────────────────┤
2. │ (42) │ Variant(Array(Nullable(Int64)), Tuple(…│
│ │… a Nullable(Int64))) │
└─────────┴────────────────────────────────────────┘
┏━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃ c1 ┃ toTypeName(c1) ┃ c2 ┃ toTypeName(c2) ┃
┡━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
1. │ 1 │ Nullable(Int64) │ Hello World! │ Nullable(String) │
├────┼─────────────────┼──────────────┼──────────────────┤
2. │ 2 │ Nullable(Int64) │ [1,2,3] │ Nullable(String) │
├────┼─────────────────┼──────────────┼──────────────────┤
3. │ 3 │ Nullable(Int64) │ 2020-01-01 │ Nullable(String) │
└────┴─────────────────┴──────────────┴──────────────────┘

View File

@ -1,4 +1,5 @@
SET input_format_json_infer_variant_from_multi_type_array=1;
SET input_format_try_infer_variants=1;
SELECT arr, toTypeName(arr) FROM format('JSONEachRow', '{"arr" : [1, "Hello", {"a" : 32}]}') FORMAT Pretty;
SELECT x, toTypeName(x) FROM format('JSONEachRow', '{"x" : 42}, {"x" : "Hello"}') FORMAT Pretty;
SELECT x, toTypeName(x) FROM format('JSONEachRow', '{"x" : [1, 2, 3]}, {"x" : {"a" : 42}}') FORMAT Pretty;
SELECT c1, toTypeName(c1), c2, toTypeName(c2) FROM format('CSV', '1,Hello World!\n2,"[1,2,3]"\n3,"2020-01-01"\n') FORMAT Pretty;