mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-19 16:20:50 +00:00
Incorporate review changes
This commit is contained in:
parent
dacbe1a427
commit
04800f596c
@ -1066,7 +1066,7 @@ class IColumn;
|
||||
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
|
||||
M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \
|
||||
M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \
|
||||
M(Bool, input_format_json_infer_variant_from_multi_type_array, false, "Try to infer variant type rather than tuple when column/array has multiple", 0) \
|
||||
M(Bool, input_format_try_infer_variants, false, "Try to infer the Variant type in text formats when there is more than one possible type for column/array elements", 0) \
|
||||
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
|
||||
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
|
||||
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \
|
||||
|
@ -93,7 +93,7 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
|
||||
{"prefer_external_sort_block_bytes", 0, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging."},
|
||||
{"input_format_force_null_for_omitted_fields", false, false, "Disable type-defaults for omitted fields when needed"},
|
||||
{"output_format_pretty_preserve_border_for_multiline_string", 1, 1, "Applies better rendering for multiline strings."},
|
||||
{"input_format_json_infer_variant_from_multi_type_array", 0, 0, "Allows inference of variant type if columns/arrays have multiple types."},
|
||||
{"input_format_try_infer_variants", 0, 0, "Try to infer Variant type in text formats when there is more than one possible type for column/array elements"},
|
||||
}},
|
||||
{"24.4", {{"input_format_json_throw_on_bad_escape_sequence", true, true, "Allow to save JSON strings with bad escape sequences"},
|
||||
{"max_parsing_threads", 0, 0, "Add a separate setting to control number of threads in parallel parsing from files"},
|
||||
|
@ -137,7 +137,6 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
|
||||
format_settings.json.read_arrays_as_strings = settings.input_format_json_read_arrays_as_strings;
|
||||
format_settings.json.try_infer_numbers_from_strings = settings.input_format_json_try_infer_numbers_from_strings;
|
||||
format_settings.json.infer_incomplete_types_as_strings = settings.input_format_json_infer_incomplete_types_as_strings;
|
||||
format_settings.json.infer_variant_from_multi_type_array = settings.input_format_json_infer_variant_from_multi_type_array;
|
||||
format_settings.json.validate_types_from_metadata = settings.input_format_json_validate_types_from_metadata;
|
||||
format_settings.json.validate_utf8 = settings.output_format_json_validate_utf8;
|
||||
format_settings.json_object_each_row.column_for_object_name = settings.format_json_object_each_row_column_for_object_name;
|
||||
@ -266,6 +265,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
|
||||
format_settings.max_parser_depth = context->getSettingsRef().max_parser_depth;
|
||||
format_settings.client_protocol_version = context->getClientProtocolVersion();
|
||||
format_settings.date_time_overflow_behavior = settings.date_time_overflow_behavior;
|
||||
format_settings.try_infer_variant = settings.input_format_try_infer_variants;
|
||||
|
||||
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
|
||||
if (format_settings.schema.is_server)
|
||||
|
@ -36,6 +36,7 @@ struct FormatSettings
|
||||
bool decimal_trailing_zeros = false;
|
||||
bool defaults_for_omitted_fields = true;
|
||||
bool is_writing_to_terminal = false;
|
||||
bool try_infer_variant = false;
|
||||
|
||||
bool seekable_read = true;
|
||||
UInt64 max_rows_to_read_for_schema_inference = 25000;
|
||||
@ -223,7 +224,6 @@ struct FormatSettings
|
||||
bool compact_allow_variable_number_of_columns = false;
|
||||
bool try_infer_objects_as_tuples = false;
|
||||
bool infer_incomplete_types_as_strings = true;
|
||||
bool infer_variant_from_multi_type_array = false;
|
||||
bool throw_on_bad_escape_sequence = true;
|
||||
bool ignore_unnecessary_fields = true;
|
||||
} json{};
|
||||
|
@ -239,6 +239,16 @@ namespace
|
||||
return true;
|
||||
}
|
||||
|
||||
bool checkIfTypesContainVariant(const DataTypes & types)
|
||||
{
|
||||
for (size_t i = 0; i < types.size(); ++i)
|
||||
{
|
||||
if (isVariant(types[i]))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void updateTypeIndexes(DataTypes & data_types, TypeIndexesSet & type_indexes)
|
||||
{
|
||||
type_indexes.clear();
|
||||
@ -308,20 +318,31 @@ namespace
|
||||
type_indexes.erase(TypeIndex::UInt64);
|
||||
}
|
||||
|
||||
/// if setting input_format_json_infer_variant_from_multi_type_array is true
|
||||
/// if setting try_infer_variant is true
|
||||
/// and nested types are not equal then we convert to type variant.
|
||||
void transformVariant(DataTypes & data_types, TypeIndexesSet & type_indexes)
|
||||
{
|
||||
auto typesAreEqual = checkIfTypesAreEqual(data_types);
|
||||
auto typesContainVariant = checkIfTypesContainVariant(data_types);
|
||||
if (typesAreEqual || typesContainVariant)
|
||||
return;
|
||||
|
||||
DataTypes new_data_types;
|
||||
TypeIndexesSet new_type_indexes;
|
||||
|
||||
auto variant_type = std::make_shared<DataTypeVariant>(data_types);
|
||||
/// replace separate types with a single variant type
|
||||
size_t i = 0;
|
||||
while (i != data_types.size())
|
||||
{
|
||||
new_data_types.push_back(variant_type);
|
||||
new_type_indexes.insert(TypeIndex::Variant);
|
||||
i++;
|
||||
}
|
||||
|
||||
data_types.clear();
|
||||
type_indexes.clear();
|
||||
data_types.push_back(variant_type);
|
||||
type_indexes.insert(TypeIndex::Variant);
|
||||
|
||||
/// make the second type variant as well
|
||||
data_types.push_back(variant_type);
|
||||
type_indexes.insert(TypeIndex::Variant);
|
||||
data_types = new_data_types;
|
||||
type_indexes = new_type_indexes;
|
||||
}
|
||||
|
||||
/// If we have only Date and DateTime types, convert Date to DateTime,
|
||||
@ -661,16 +682,14 @@ namespace
|
||||
if (settings.try_infer_dates || settings.try_infer_datetimes)
|
||||
transformDatesAndDateTimes(data_types, type_indexes);
|
||||
|
||||
if (settings.try_infer_variant)
|
||||
transformVariant(data_types, type_indexes);
|
||||
|
||||
if constexpr (!is_json)
|
||||
return;
|
||||
|
||||
/// Check settings specific for JSON formats.
|
||||
|
||||
if (settings.json.infer_variant_from_multi_type_array)
|
||||
{
|
||||
transformVariant(data_types, type_indexes);
|
||||
}
|
||||
|
||||
/// Convert numbers inferred from strings back to strings if needed.
|
||||
if (settings.json.try_infer_numbers_from_strings || settings.json.read_numbers_as_strings)
|
||||
transformJSONNumbersBackToString(data_types, settings, type_indexes, json_info);
|
||||
@ -685,6 +704,10 @@ namespace
|
||||
|
||||
if (settings.json.try_infer_objects_as_tuples)
|
||||
mergeJSONPaths(data_types, type_indexes, settings, json_info);
|
||||
|
||||
if (settings.try_infer_variant)
|
||||
transformVariant(data_types, type_indexes);
|
||||
|
||||
};
|
||||
|
||||
auto transform_complex_types = [&](DataTypes & data_types, TypeIndexesSet & type_indexes)
|
||||
@ -696,14 +719,12 @@ namespace
|
||||
/// If there is at least one non Nothing type, change all Nothing types to it.
|
||||
transformNothingComplexTypes(data_types, type_indexes);
|
||||
|
||||
if (settings.try_infer_variant)
|
||||
transformVariant(data_types, type_indexes);
|
||||
|
||||
if constexpr (!is_json)
|
||||
return;
|
||||
|
||||
if (settings.json.infer_variant_from_multi_type_array)
|
||||
{
|
||||
transformVariant(data_types, type_indexes);
|
||||
}
|
||||
|
||||
/// Convert JSON tuples with same nested types to arrays.
|
||||
transformTuplesWithEqualNestedTypesToArrays(data_types, type_indexes);
|
||||
|
||||
@ -715,6 +736,9 @@ namespace
|
||||
|
||||
if (json_info && json_info->allow_merging_named_tuples)
|
||||
mergeNamedTuples(data_types, type_indexes, settings, json_info);
|
||||
|
||||
if (settings.try_infer_variant)
|
||||
transformVariant(data_types, type_indexes);
|
||||
};
|
||||
|
||||
transformTypesRecursively(types, transform_simple_types, transform_complex_types);
|
||||
|
@ -2,7 +2,7 @@
|
||||
┃ arr ┃ toTypeName(arr) ┃
|
||||
┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
1. │ [1,'Hello',(32)] │ Array(Variant(Int64, String, Tuple(…│
|
||||
│ │… a Int64))) │
|
||||
│ │… a Nullable(Int64)))) │
|
||||
└──────────────────┴─────────────────────────────────────┘
|
||||
┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ x ┃ toTypeName(x) ┃
|
||||
@ -11,12 +11,21 @@
|
||||
├───────┼────────────────────────┤
|
||||
2. │ Hello │ Variant(Int64, String) │
|
||||
└───────┴────────────────────────┘
|
||||
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ x ┃ toTypeName(x) ┃
|
||||
┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
1. │ [1,2,3] │ Variant(Array(Int64), Tuple(…│
|
||||
│ │… a Int64)) │
|
||||
├─────────┼──────────────────────────────┤
|
||||
2. │ (42) │ Variant(Array(Int64), Tuple(…│
|
||||
│ │… a Int64)) │
|
||||
└─────────┴──────────────────────────────┘
|
||||
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ x ┃ toTypeName(x) ┃
|
||||
┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
1. │ [1,2,3] │ Variant(Array(Nullable(Int64)), Tuple(…│
|
||||
│ │… a Nullable(Int64))) │
|
||||
├─────────┼────────────────────────────────────────┤
|
||||
2. │ (42) │ Variant(Array(Nullable(Int64)), Tuple(…│
|
||||
│ │… a Nullable(Int64))) │
|
||||
└─────────┴────────────────────────────────────────┘
|
||||
┏━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ toTypeName(c1) ┃ c2 ┃ toTypeName(c2) ┃
|
||||
┡━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
|
||||
1. │ 1 │ Nullable(Int64) │ Hello World! │ Nullable(String) │
|
||||
├────┼─────────────────┼──────────────┼──────────────────┤
|
||||
2. │ 2 │ Nullable(Int64) │ [1,2,3] │ Nullable(String) │
|
||||
├────┼─────────────────┼──────────────┼──────────────────┤
|
||||
3. │ 3 │ Nullable(Int64) │ 2020-01-01 │ Nullable(String) │
|
||||
└────┴─────────────────┴──────────────┴──────────────────┘
|
||||
|
@ -1,4 +1,5 @@
|
||||
SET input_format_json_infer_variant_from_multi_type_array=1;
|
||||
SET input_format_try_infer_variants=1;
|
||||
SELECT arr, toTypeName(arr) FROM format('JSONEachRow', '{"arr" : [1, "Hello", {"a" : 32}]}') FORMAT Pretty;
|
||||
SELECT x, toTypeName(x) FROM format('JSONEachRow', '{"x" : 42}, {"x" : "Hello"}') FORMAT Pretty;
|
||||
SELECT x, toTypeName(x) FROM format('JSONEachRow', '{"x" : [1, 2, 3]}, {"x" : {"a" : 42}}') FORMAT Pretty;
|
||||
SELECT c1, toTypeName(c1), c2, toTypeName(c2) FROM format('CSV', '1,Hello World!\n2,"[1,2,3]"\n3,"2020-01-01"\n') FORMAT Pretty;
|
Loading…
Reference in New Issue
Block a user