mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-28 02:21:59 +00:00
Merge pull request #60808 from Avogar/json-ambg-tuple-inference
Small improvements in JSON schema inference
This commit is contained in:
commit
9546b3f5fb
@ -549,6 +549,48 @@ Result:
|
|||||||
└───────┴─────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
└───────┴─────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
|
##### input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects
|
||||||
|
|
||||||
|
Enabling this setting makes ClickHouse use the String type for ambiguous paths during named tuple inference from JSON objects (when `input_format_json_try_infer_named_tuples_from_objects` is enabled) instead of throwing an exception.
|
||||||
|
This allows reading JSON objects as named Tuples even if there are ambiguous paths.
|
||||||
|
|
||||||
|
Disabled by default.
|
||||||
|
|
||||||
|
**Examples**
|
||||||
|
|
||||||
|
With disabled setting:
|
||||||
|
```sql
|
||||||
|
SET input_format_json_try_infer_named_tuples_from_objects = 1;
|
||||||
|
SET input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = 0;
|
||||||
|
DESC format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
|
||||||
|
```
|
||||||
|
Result:
|
||||||
|
|
||||||
|
```text
|
||||||
|
Code: 636. DB::Exception: The table structure cannot be extracted from a JSONEachRow format file. Error:
|
||||||
|
Code: 117. DB::Exception: JSON objects have ambiguous data: in some objects path 'a' has type 'Int64' and in some - 'Tuple(b String)'. You can enable setting input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type for path 'a'. (INCORRECT_DATA) (version 24.3.1.1).
|
||||||
|
You can specify the structure manually. (CANNOT_EXTRACT_TABLE_STRUCTURE)
|
||||||
|
```
|
||||||
|
|
||||||
|
With enabled setting:
|
||||||
|
```sql
|
||||||
|
SET input_format_json_try_infer_named_tuples_from_objects = 1;
|
||||||
|
SET input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = 1;
|
||||||
|
DESC format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
|
||||||
|
SELECT * FROM format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
|
||||||
|
```
|
||||||
|
|
||||||
|
Result:
|
||||||
|
```text
|
||||||
|
┌─name─┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||||
|
│ obj │ Tuple(a Nullable(String)) │ │ │ │ │ │
|
||||||
|
└──────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||||
|
┌─obj─────────────────┐
|
||||||
|
│ ('42') │
|
||||||
|
│ ('{"b" : "Hello"}') │
|
||||||
|
└─────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
##### input_format_json_read_objects_as_strings
|
##### input_format_json_read_objects_as_strings
|
||||||
|
|
||||||
Enabling this setting allows reading nested JSON objects as strings.
|
Enabling this setting allows reading nested JSON objects as strings.
|
||||||
@ -1554,6 +1596,28 @@ DESC format(JSONEachRow, $$
|
|||||||
└──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
└──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### input_format_try_infer_exponent_floats
|
||||||
|
|
||||||
|
If enabled, ClickHouse will try to infer floats in exponential form for text formats (except JSON where numbers in exponential form are always inferred).
|
||||||
|
|
||||||
|
Disabled by default.
|
||||||
|
|
||||||
|
**Example**
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SET input_format_try_infer_exponent_floats = 1;
|
||||||
|
DESC format(CSV,
|
||||||
|
$$1.1E10
|
||||||
|
2.3e-12
|
||||||
|
42E00
|
||||||
|
$$)
|
||||||
|
```
|
||||||
|
```response
|
||||||
|
┌─name─┬─type──────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||||
|
│ c1 │ Nullable(Float64) │ │ │ │ │ │
|
||||||
|
└──────┴───────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
## Self describing formats {#self-describing-formats}
|
## Self describing formats {#self-describing-formats}
|
||||||
|
|
||||||
Self-describing formats contain information about the structure of the data in the data itself,
|
Self-describing formats contain information about the structure of the data in the data itself,
|
||||||
|
@ -1022,6 +1022,7 @@ class IColumn;
|
|||||||
M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \
|
M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \
|
||||||
M(Bool, input_format_json_read_arrays_as_strings, true, "Allow to parse JSON arrays as strings in JSON input formats", 0) \
|
M(Bool, input_format_json_read_arrays_as_strings, true, "Allow to parse JSON arrays as strings in JSON input formats", 0) \
|
||||||
M(Bool, input_format_json_try_infer_named_tuples_from_objects, true, "Try to infer named tuples from JSON objects in JSON input formats", 0) \
|
M(Bool, input_format_json_try_infer_named_tuples_from_objects, true, "Try to infer named tuples from JSON objects in JSON input formats", 0) \
|
||||||
|
M(Bool, input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects, false, "Use String type instead of an exception in case of ambiguous paths in JSON objects during named tuples inference", 0) \
|
||||||
M(Bool, input_format_json_infer_incomplete_types_as_strings, true, "Use type String for keys that contains only Nulls or empty objects/arrays during schema inference in JSON input formats", 0) \
|
M(Bool, input_format_json_infer_incomplete_types_as_strings, true, "Use type String for keys that contains only Nulls or empty objects/arrays during schema inference in JSON input formats", 0) \
|
||||||
M(Bool, input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \
|
M(Bool, input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \
|
||||||
M(Bool, input_format_json_ignore_unknown_keys_in_named_tuple, true, "Ignore unknown keys in json object for named tuples", 0) \
|
M(Bool, input_format_json_ignore_unknown_keys_in_named_tuple, true, "Ignore unknown keys in json object for named tuples", 0) \
|
||||||
@ -1029,7 +1030,7 @@ class IColumn;
|
|||||||
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
|
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
|
||||||
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
|
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
|
||||||
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \
|
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \
|
||||||
M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats", 0) \
|
M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats (except JSON, where exponent numbers are always inferred)", 0) \
|
||||||
M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \
|
M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \
|
||||||
M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \
|
M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \
|
||||||
M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \
|
M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \
|
||||||
|
@ -89,6 +89,7 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
|
|||||||
{"use_page_cache_for_disks_without_file_cache", false, false, "Added userspace page cache"},
|
{"use_page_cache_for_disks_without_file_cache", false, false, "Added userspace page cache"},
|
||||||
{"read_from_page_cache_if_exists_otherwise_bypass_cache", false, false, "Added userspace page cache"},
|
{"read_from_page_cache_if_exists_otherwise_bypass_cache", false, false, "Added userspace page cache"},
|
||||||
{"page_cache_inject_eviction", false, false, "Added userspace page cache"},
|
{"page_cache_inject_eviction", false, false, "Added userspace page cache"},
|
||||||
|
{"input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects", false, false, "Allow to use String type for ambiguous paths during named tuple inference from JSON objects"},
|
||||||
}},
|
}},
|
||||||
{"24.2", {{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"},
|
{"24.2", {{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"},
|
||||||
{"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"},
|
{"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"},
|
||||||
|
@ -450,8 +450,10 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
|
|||||||
break;
|
break;
|
||||||
case FormatSettings::EscapingRule::JSON:
|
case FormatSettings::EscapingRule::JSON:
|
||||||
result += fmt::format(
|
result += fmt::format(
|
||||||
", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_bools_as_strings={}, read_objects_as_strings={}, read_numbers_as_strings={}, "
|
", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_bools_as_strings={}, read_objects_as_strings={}, "
|
||||||
"read_arrays_as_strings={}, try_infer_objects_as_tuples={}, infer_incomplete_types_as_strings={}, try_infer_objects={}",
|
"read_numbers_as_strings={}, "
|
||||||
|
"read_arrays_as_strings={}, try_infer_objects_as_tuples={}, infer_incomplete_types_as_strings={}, try_infer_objects={}, "
|
||||||
|
"use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects={}",
|
||||||
settings.json.try_infer_numbers_from_strings,
|
settings.json.try_infer_numbers_from_strings,
|
||||||
settings.json.read_bools_as_numbers,
|
settings.json.read_bools_as_numbers,
|
||||||
settings.json.read_bools_as_strings,
|
settings.json.read_bools_as_strings,
|
||||||
@ -460,7 +462,8 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
|
|||||||
settings.json.read_arrays_as_strings,
|
settings.json.read_arrays_as_strings,
|
||||||
settings.json.try_infer_objects_as_tuples,
|
settings.json.try_infer_objects_as_tuples,
|
||||||
settings.json.infer_incomplete_types_as_strings,
|
settings.json.infer_incomplete_types_as_strings,
|
||||||
settings.json.allow_object_type);
|
settings.json.allow_object_type,
|
||||||
|
settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
|
@ -122,6 +122,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
|
|||||||
format_settings.json.write_named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
|
format_settings.json.write_named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
|
||||||
format_settings.json.skip_null_value_in_named_tuples = settings.output_format_json_skip_null_value_in_named_tuples;
|
format_settings.json.skip_null_value_in_named_tuples = settings.output_format_json_skip_null_value_in_named_tuples;
|
||||||
format_settings.json.read_named_tuples_as_objects = settings.input_format_json_named_tuples_as_objects;
|
format_settings.json.read_named_tuples_as_objects = settings.input_format_json_named_tuples_as_objects;
|
||||||
|
format_settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = settings.input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects;
|
||||||
format_settings.json.defaults_for_missing_elements_in_named_tuple = settings.input_format_json_defaults_for_missing_elements_in_named_tuple;
|
format_settings.json.defaults_for_missing_elements_in_named_tuple = settings.input_format_json_defaults_for_missing_elements_in_named_tuple;
|
||||||
format_settings.json.ignore_unknown_keys_in_named_tuple = settings.input_format_json_ignore_unknown_keys_in_named_tuple;
|
format_settings.json.ignore_unknown_keys_in_named_tuple = settings.input_format_json_ignore_unknown_keys_in_named_tuple;
|
||||||
format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
|
format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
|
||||||
|
@ -202,6 +202,7 @@ struct FormatSettings
|
|||||||
bool quote_decimals = false;
|
bool quote_decimals = false;
|
||||||
bool escape_forward_slashes = true;
|
bool escape_forward_slashes = true;
|
||||||
bool read_named_tuples_as_objects = false;
|
bool read_named_tuples_as_objects = false;
|
||||||
|
bool use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = false;
|
||||||
bool write_named_tuples_as_objects = false;
|
bool write_named_tuples_as_objects = false;
|
||||||
bool skip_null_value_in_named_tuples = false;
|
bool skip_null_value_in_named_tuples = false;
|
||||||
bool defaults_for_missing_elements_in_named_tuple = false;
|
bool defaults_for_missing_elements_in_named_tuple = false;
|
||||||
|
@ -136,7 +136,7 @@ namespace
|
|||||||
|
|
||||||
bool empty() const { return paths.empty(); }
|
bool empty() const { return paths.empty(); }
|
||||||
|
|
||||||
DataTypePtr finalize() const
|
DataTypePtr finalize(bool use_string_type_for_ambiguous_paths = false) const
|
||||||
{
|
{
|
||||||
if (paths.empty())
|
if (paths.empty())
|
||||||
throw Exception(ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA, "Cannot infer named Tuple from JSON object because object is empty");
|
throw Exception(ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA, "Cannot infer named Tuple from JSON object because object is empty");
|
||||||
@ -167,7 +167,7 @@ namespace
|
|||||||
current_node->leaf_type = type;
|
current_node->leaf_type = type;
|
||||||
}
|
}
|
||||||
|
|
||||||
return root_node.getType();
|
return root_node.getType(use_string_type_for_ambiguous_paths);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -180,19 +180,8 @@ namespace
|
|||||||
/// Store path to this node for better exception message in case of ambiguous paths.
|
/// Store path to this node for better exception message in case of ambiguous paths.
|
||||||
String path;
|
String path;
|
||||||
|
|
||||||
DataTypePtr getType() const
|
DataTypePtr getType(bool use_string_type_for_ambiguous_paths) const
|
||||||
{
|
{
|
||||||
/// Check if we have ambiguous paths.
|
|
||||||
/// For example:
|
|
||||||
/// 'a.b.c' : Int32 and 'a.b' : String
|
|
||||||
/// Also check if leaf type is Nothing, because the next situation is possible:
|
|
||||||
/// {"a" : {"b" : null}} -> 'a.b' : Nullable(Nothing)
|
|
||||||
/// {"a" : {"b" : {"c" : 42}}} -> 'a.b.c' : Int32
|
|
||||||
/// And after merge we will have ambiguous paths 'a.b.c' : Int32 and 'a.b' : Nullable(Nothing),
|
|
||||||
/// but it's a valid case and we should ignore path 'a.b'.
|
|
||||||
if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty())
|
|
||||||
throw Exception(ErrorCodes::INCORRECT_DATA, "JSON objects have ambiguous paths: '{}' with type {} and '{}'", path, leaf_type->getName(), nodes.begin()->second.path);
|
|
||||||
|
|
||||||
if (nodes.empty())
|
if (nodes.empty())
|
||||||
return leaf_type;
|
return leaf_type;
|
||||||
|
|
||||||
@ -203,10 +192,33 @@ namespace
|
|||||||
for (const auto & [name, node] : nodes)
|
for (const auto & [name, node] : nodes)
|
||||||
{
|
{
|
||||||
node_names.push_back(name);
|
node_names.push_back(name);
|
||||||
node_types.push_back(node.getType());
|
node_types.push_back(node.getType(use_string_type_for_ambiguous_paths));
|
||||||
}
|
}
|
||||||
|
|
||||||
return std::make_shared<DataTypeTuple>(std::move(node_types), std::move(node_names));
|
auto tuple_type = std::make_shared<DataTypeTuple>(std::move(node_types), std::move(node_names));
|
||||||
|
|
||||||
|
/// Check if we have ambiguous paths.
|
||||||
|
/// For example:
|
||||||
|
/// 'a.b.c' : Int32 and 'a.b' : String
|
||||||
|
/// Also check if leaf type is Nothing, because the next situation is possible:
|
||||||
|
/// {"a" : {"b" : null}} -> 'a.b' : Nullable(Nothing)
|
||||||
|
/// {"a" : {"b" : {"c" : 42}}} -> 'a.b.c' : Int32
|
||||||
|
/// And after merge we will have ambiguous paths 'a.b.c' : Int32 and 'a.b' : Nullable(Nothing),
|
||||||
|
/// but it's a valid case and we should ignore path 'a.b'.
|
||||||
|
if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty())
|
||||||
|
{
|
||||||
|
if (use_string_type_for_ambiguous_paths)
|
||||||
|
return std::make_shared<DataTypeString>();
|
||||||
|
|
||||||
|
throw Exception(
|
||||||
|
ErrorCodes::INCORRECT_DATA,
|
||||||
|
"JSON objects have ambiguous data: in some objects path '{}' has type '{}' and in some - '{}'. You can enable setting "
|
||||||
|
"input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type "
|
||||||
|
"for path '{}'",
|
||||||
|
path, leaf_type->getName(), tuple_type->getName(), path);
|
||||||
|
}
|
||||||
|
|
||||||
|
return tuple_type;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -866,13 +878,15 @@ namespace
|
|||||||
return std::make_shared<DataTypeTuple>(nested_types);
|
return std::make_shared<DataTypeTuple>(nested_types);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <bool is_json>
|
||||||
bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings)
|
bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings)
|
||||||
{
|
{
|
||||||
if (settings.try_infer_exponent_floats)
|
if (is_json || settings.try_infer_exponent_floats)
|
||||||
return tryReadFloatText(value, buf);
|
return tryReadFloatText(value, buf);
|
||||||
return tryReadFloatTextNoExponent(value, buf);
|
return tryReadFloatTextNoExponent(value, buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <bool is_json>
|
||||||
DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings)
|
DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings)
|
||||||
{
|
{
|
||||||
if (buf.eof())
|
if (buf.eof())
|
||||||
@ -911,7 +925,7 @@ namespace
|
|||||||
buf.position() = number_start;
|
buf.position() = number_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tryReadFloat(tmp_float, buf, settings))
|
if (tryReadFloat<is_json>(tmp_float, buf, settings))
|
||||||
{
|
{
|
||||||
if (read_int && buf.position() == int_end)
|
if (read_int && buf.position() == int_end)
|
||||||
return std::make_shared<DataTypeInt64>();
|
return std::make_shared<DataTypeInt64>();
|
||||||
@ -945,7 +959,7 @@ namespace
|
|||||||
peekable_buf.rollbackToCheckpoint(true);
|
peekable_buf.rollbackToCheckpoint(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tryReadFloat(tmp_float, peekable_buf, settings))
|
if (tryReadFloat<is_json>(tmp_float, peekable_buf, settings))
|
||||||
{
|
{
|
||||||
/// Float parsing reads no fewer bytes than integer parsing,
|
/// Float parsing reads no fewer bytes than integer parsing,
|
||||||
/// so position of the buffer is either the same, or further.
|
/// so position of the buffer is either the same, or further.
|
||||||
@ -957,7 +971,7 @@ namespace
|
|||||||
return std::make_shared<DataTypeFloat64>();
|
return std::make_shared<DataTypeFloat64>();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (tryReadFloat(tmp_float, buf, settings))
|
else if (tryReadFloat<is_json>(tmp_float, buf, settings))
|
||||||
{
|
{
|
||||||
return std::make_shared<DataTypeFloat64>();
|
return std::make_shared<DataTypeFloat64>();
|
||||||
}
|
}
|
||||||
@ -966,6 +980,36 @@ namespace
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <bool is_json>
|
||||||
|
DataTypePtr tryInferNumberFromStringImpl(std::string_view field, const FormatSettings & settings)
|
||||||
|
{
|
||||||
|
ReadBufferFromString buf(field);
|
||||||
|
|
||||||
|
if (settings.try_infer_integers)
|
||||||
|
{
|
||||||
|
Int64 tmp_int;
|
||||||
|
if (tryReadIntText(tmp_int, buf) && buf.eof())
|
||||||
|
return std::make_shared<DataTypeInt64>();
|
||||||
|
|
||||||
|
/// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
|
||||||
|
buf.position() = buf.buffer().begin();
|
||||||
|
|
||||||
|
/// In case of Int64 overflow, try to infer UInt64
|
||||||
|
UInt64 tmp_uint;
|
||||||
|
if (tryReadIntText(tmp_uint, buf) && buf.eof())
|
||||||
|
return std::make_shared<DataTypeUInt64>();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
|
||||||
|
buf.position() = buf.buffer().begin();
|
||||||
|
|
||||||
|
Float64 tmp;
|
||||||
|
if (tryReadFloat<is_json>(tmp, buf, settings) && buf.eof())
|
||||||
|
return std::make_shared<DataTypeFloat64>();
|
||||||
|
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
template <bool is_json>
|
template <bool is_json>
|
||||||
DataTypePtr tryInferString(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
|
DataTypePtr tryInferString(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
|
||||||
{
|
{
|
||||||
@ -995,7 +1039,7 @@ namespace
|
|||||||
{
|
{
|
||||||
if (settings.json.try_infer_numbers_from_strings)
|
if (settings.json.try_infer_numbers_from_strings)
|
||||||
{
|
{
|
||||||
if (auto number_type = tryInferNumberFromString(field, settings))
|
if (auto number_type = tryInferNumberFromStringImpl<true>(field, settings))
|
||||||
{
|
{
|
||||||
json_info->numbers_parsed_from_json_strings.insert(number_type.get());
|
json_info->numbers_parsed_from_json_strings.insert(number_type.get());
|
||||||
return number_type;
|
return number_type;
|
||||||
@ -1238,7 +1282,7 @@ namespace
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Number
|
/// Number
|
||||||
return tryInferNumber(buf, settings);
|
return tryInferNumber<is_json>(buf, settings);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1294,7 +1338,7 @@ void transformFinalInferredJSONTypeIfNeededImpl(DataTypePtr & data_type, const F
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
data_type = json_paths->finalize();
|
data_type = json_paths->finalize(settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects);
|
||||||
transformFinalInferredJSONTypeIfNeededImpl(data_type, settings, json_info, remain_nothing_types);
|
transformFinalInferredJSONTypeIfNeededImpl(data_type, settings, json_info, remain_nothing_types);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -1377,31 +1421,7 @@ void transformFinalInferredJSONTypeIfNeeded(DataTypePtr & data_type, const Forma
|
|||||||
|
|
||||||
DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings)
|
DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings)
|
||||||
{
|
{
|
||||||
ReadBufferFromString buf(field);
|
return tryInferNumberFromStringImpl<false>(field, settings);
|
||||||
|
|
||||||
if (settings.try_infer_integers)
|
|
||||||
{
|
|
||||||
Int64 tmp_int;
|
|
||||||
if (tryReadIntText(tmp_int, buf) && buf.eof())
|
|
||||||
return std::make_shared<DataTypeInt64>();
|
|
||||||
|
|
||||||
/// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
|
|
||||||
buf.position() = buf.buffer().begin();
|
|
||||||
|
|
||||||
/// In case of Int64 overflow, try to infer UInt64
|
|
||||||
UInt64 tmp_uint;
|
|
||||||
if (tryReadIntText(tmp_uint, buf) && buf.eof())
|
|
||||||
return std::make_shared<DataTypeUInt64>();
|
|
||||||
}
|
|
||||||
|
|
||||||
/// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
|
|
||||||
buf.position() = buf.buffer().begin();
|
|
||||||
|
|
||||||
Float64 tmp;
|
|
||||||
if (tryReadFloat(tmp, buf, settings) && buf.eof())
|
|
||||||
return std::make_shared<DataTypeFloat64>();
|
|
||||||
|
|
||||||
return nullptr;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings)
|
DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings)
|
||||||
|
@ -1,2 +1,3 @@
|
|||||||
c1 Nullable(String)
|
c1 Nullable(String)
|
||||||
c1 Nullable(Float64)
|
c1 Nullable(Float64)
|
||||||
|
x Nullable(Float64)
|
||||||
|
@ -1,2 +1,5 @@
|
|||||||
DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 0;
|
DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 0;
|
||||||
DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 1;
|
DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 1;
|
||||||
|
-- This setting should not take effect on JSON formats
|
||||||
|
DESC format(JSONEachRow, '{"x" : 1.1e20}') settings input_format_try_infer_exponent_floats = 0;
|
||||||
|
|
||||||
|
@ -0,0 +1,3 @@
|
|||||||
|
obj Tuple(\n a Nullable(String))
|
||||||
|
('42')
|
||||||
|
('{"b" : 42}')
|
@ -0,0 +1,4 @@
|
|||||||
|
set input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects=1;
|
||||||
|
desc format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : 42}}}');
|
||||||
|
select * from format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : 42}}}');
|
||||||
|
|
Loading…
Reference in New Issue
Block a user