From 70abdf7a414ee57d59df51f6cf5ec435e2830f9e Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 4 Mar 2024 17:32:22 +0000 Subject: [PATCH 1/2] Small improvements in JSON schema inference --- docs/en/interfaces/schema-inference.md | 64 +++++++++++++ src/Core/Settings.h | 3 +- src/Core/SettingsChangesHistory.h | 3 + src/Formats/EscapingRuleUtils.cpp | 9 +- src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + src/Formats/SchemaInferenceUtils.cpp | 91 +++++++++++-------- ...02982_dont_infer_exponent_floats.reference | 1 + .../02982_dont_infer_exponent_floats.sql | 3 + ...erence_ambiguous_paths_as_string.reference | 3 + ...es_inference_ambiguous_paths_as_string.sql | 4 + 11 files changed, 142 insertions(+), 41 deletions(-) create mode 100644 tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.reference create mode 100644 tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.sql diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index 39ae69eaef4..f2e9136d1db 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -549,6 +549,48 @@ Result: └───────┴─────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ ``` +##### input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects + +Enabling this setting allows to use String type for ambiguous paths during named tuples inference from JSON objects (when `input_format_json_try_infer_named_tuples_from_objects` is enabled) instead of an exception. +It allows to read JSON objects as named Tuples even if there are ambiguous paths. + +Disabled by default. 
+ +**Examples** + +With disabled setting: +```sql +SET input_format_json_try_infer_named_tuples_from_objects = 1; +SET input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = 0; +DESC format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}'); +``` +Result: + +```text +Code: 636. DB::Exception: The table structure cannot be extracted from a JSONEachRow format file. Error: +Code: 117. DB::Exception: JSON objects have ambiguous paths: 'a' (with type Int64) and 'a.b'. You can enable setting input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type for path 'a'. (INCORRECT_DATA) (version 24.3.1.1). +You can specify the structure manually. (CANNOT_EXTRACT_TABLE_STRUCTURE) +``` + +With enabled setting: +```sql +SET input_format_json_try_infer_named_tuples_from_objects = 1; +SET input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = 1; +DESC format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}'); +SELECT * FROM format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}'); +``` + +Result: +```text +┌─name─┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐ +│ obj │ Tuple(a Nullable(String)) │ │ │ │ │ │ +└──────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ +┌─obj─────────────────┐ +│ ('42') │ +│ ('{"b" : "Hello"}') │ +└─────────────────────┘ +``` + ##### input_format_json_read_objects_as_strings Enabling this setting allows reading nested JSON objects as strings. 
@@ -1554,6 +1596,28 @@ DESC format(JSONEachRow, $$ └──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ ``` +#### input_format_try_infer_exponent_floats + +If enabled, ClickHouse will try to infer floats in exponential form for text formats (except JSON where numbers in exponential form are always inferred). + +Disabled by default. + +**Example** + +```sql +SET input_format_try_infer_exponent_floats = 1; +DESC format(CSV, +$$1.1E10 +2.3e-12 +42E00 +$$) +``` +```response +┌─name─┬─type──────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐ +│ c1 │ Nullable(Float64) │ │ │ │ │ │ +└──────┴───────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ +``` + ## Self describing formats {#self-describing-formats} Self-describing formats contain information about the structure of the data in the data itself, diff --git a/src/Core/Settings.h b/src/Core/Settings.h index ae6ea165cc9..3f71223c910 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1018,6 +1018,7 @@ class IColumn; M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \ M(Bool, input_format_json_read_arrays_as_strings, true, "Allow to parse JSON arrays as strings in JSON input formats", 0) \ M(Bool, input_format_json_try_infer_named_tuples_from_objects, true, "Try to infer named tuples from JSON objects in JSON input formats", 0) \ + M(Bool, input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects, false, "Use String type instead of an exception in case of ambiguous paths in JSON objects during named tuples inference", 0) \ M(Bool, input_format_json_infer_incomplete_types_as_strings, true, "Use type String for keys that contains only Nulls or empty objects/arrays during schema inference in JSON input formats", 0) \ M(Bool, 
input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \ M(Bool, input_format_json_ignore_unknown_keys_in_named_tuple, true, "Ignore unknown keys in json object for named tuples", 0) \ @@ -1025,7 +1026,7 @@ class IColumn; M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \ - M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats", 0) \ + M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats (except JSON, where exponent numbers are always inferred)", 0) \ M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \ M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \ M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. 
If turned off, default and null values are not serialized", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index face1def4b4..f473d677ecd 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -85,6 +85,9 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static std::map settings_changes_history = { + {"24.3", { + {"input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects", false, false, "Allow to use String type for ambiguous paths during named tuple inference from JSON objects"}, + }}, {"24.2", { {"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"}, {"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"}, diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 16f8a341e03..577988871f3 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -450,8 +450,10 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo break; case FormatSettings::EscapingRule::JSON: result += fmt::format( - ", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_bools_as_strings={}, read_objects_as_strings={}, read_numbers_as_strings={}, " - "read_arrays_as_strings={}, try_infer_objects_as_tuples={}, infer_incomplete_types_as_strings={}, try_infer_objects={}", + ", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_bools_as_strings={}, read_objects_as_strings={}, " + "read_numbers_as_strings={}, " + "read_arrays_as_strings={}, try_infer_objects_as_tuples={}, infer_incomplete_types_as_strings={}, try_infer_objects={}, " + "use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects={}", 
settings.json.try_infer_numbers_from_strings, settings.json.read_bools_as_numbers, settings.json.read_bools_as_strings, @@ -460,7 +462,8 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo settings.json.read_arrays_as_strings, settings.json.try_infer_objects_as_tuples, settings.json.infer_incomplete_types_as_strings, - settings.json.allow_object_type); + settings.json.allow_object_type, + settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects); break; default: break; diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index a4a08d762b9..ccead6688a7 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -105,6 +105,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.json.write_named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects; format_settings.json.skip_null_value_in_named_tuples = settings.output_format_json_skip_null_value_in_named_tuples; format_settings.json.read_named_tuples_as_objects = settings.input_format_json_named_tuples_as_objects; + format_settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = settings.input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects; format_settings.json.defaults_for_missing_elements_in_named_tuple = settings.input_format_json_defaults_for_missing_elements_in_named_tuple; format_settings.json.ignore_unknown_keys_in_named_tuple = settings.input_format_json_ignore_unknown_keys_in_named_tuple; format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 01c3632c730..42b21c77cef 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -202,6 +202,7 @@ struct FormatSettings bool quote_decimals = false; bool escape_forward_slashes = true; 
bool read_named_tuples_as_objects = false; + bool use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = false; bool write_named_tuples_as_objects = false; bool skip_null_value_in_named_tuples = false; bool defaults_for_missing_elements_in_named_tuple = false; diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index 06b52e7a7a2..998f97fae0d 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -136,7 +136,7 @@ namespace bool empty() const { return paths.empty(); } - DataTypePtr finalize() const + DataTypePtr finalize(bool use_string_type_for_ambiguous_paths = false) const { if (paths.empty()) throw Exception(ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA, "Cannot infer named Tuple from JSON object because object is empty"); @@ -167,7 +167,7 @@ namespace current_node->leaf_type = type; } - return root_node.getType(); + return root_node.getType(use_string_type_for_ambiguous_paths); } private: @@ -180,7 +180,7 @@ namespace /// Store path to this node for better exception message in case of ambiguous paths. String path; - DataTypePtr getType() const + DataTypePtr getType(bool use_string_type_for_ambiguous_paths) const { /// Check if we have ambiguous paths. /// For example: @@ -191,7 +191,16 @@ namespace /// And after merge we will have ambiguous paths 'a.b.c' : Int32 and 'a.b' : Nullable(Nothing), /// but it's a valid case and we should ignore path 'a.b'. if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty()) - throw Exception(ErrorCodes::INCORRECT_DATA, "JSON objects have ambiguous paths: '{}' with type {} and '{}'", path, leaf_type->getName(), nodes.begin()->second.path); + { + if (use_string_type_for_ambiguous_paths) + return std::make_shared(); + throw Exception( + ErrorCodes::INCORRECT_DATA, + "JSON objects have ambiguous paths: '{}' (with type {}) and '{}'. 
You can enable setting " + "input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type " + "for path '{}'", + path, leaf_type->getName(), nodes.begin()->second.path, path); + } if (nodes.empty()) return leaf_type; @@ -203,7 +212,7 @@ namespace for (const auto & [name, node] : nodes) { node_names.push_back(name); - node_types.push_back(node.getType()); + node_types.push_back(node.getType(use_string_type_for_ambiguous_paths)); } return std::make_shared(std::move(node_types), std::move(node_names)); @@ -866,13 +875,15 @@ namespace return std::make_shared(nested_types); } + template bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings) { - if (settings.try_infer_exponent_floats) + if (is_json || settings.try_infer_exponent_floats) return tryReadFloatText(value, buf); return tryReadFloatTextNoExponent(value, buf); } + template DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings) { if (buf.eof()) @@ -911,7 +922,7 @@ namespace buf.position() = number_start; } - if (tryReadFloat(tmp_float, buf, settings)) + if (tryReadFloat(tmp_float, buf, settings)) { if (read_int && buf.position() == int_end) return std::make_shared(); @@ -945,7 +956,7 @@ namespace peekable_buf.rollbackToCheckpoint(true); } - if (tryReadFloat(tmp_float, peekable_buf, settings)) + if (tryReadFloat(tmp_float, peekable_buf, settings)) { /// Float parsing reads no fewer bytes than integer parsing, /// so position of the buffer is either the same, or further. 
@@ -957,7 +968,7 @@ namespace return std::make_shared(); } } - else if (tryReadFloat(tmp_float, buf, settings)) + else if (tryReadFloat(tmp_float, buf, settings)) { return std::make_shared(); } @@ -966,6 +977,36 @@ namespace return nullptr; } + template + DataTypePtr tryInferNumberFromStringImpl(std::string_view field, const FormatSettings & settings) + { + ReadBufferFromString buf(field); + + if (settings.try_infer_integers) + { + Int64 tmp_int; + if (tryReadIntText(tmp_int, buf) && buf.eof()) + return std::make_shared(); + + /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof. + buf.position() = buf.buffer().begin(); + + /// In case of Int64 overflow, try to infer UInt64 + UInt64 tmp_uint; + if (tryReadIntText(tmp_uint, buf) && buf.eof()) + return std::make_shared(); + } + + /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof. + buf.position() = buf.buffer().begin(); + + Float64 tmp; + if (tryReadFloat(tmp, buf, settings) && buf.eof()) + return std::make_shared(); + + return nullptr; + } + template DataTypePtr tryInferString(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info) { @@ -995,7 +1036,7 @@ namespace { if (settings.json.try_infer_numbers_from_strings) { - if (auto number_type = tryInferNumberFromString(field, settings)) + if (auto number_type = tryInferNumberFromStringImpl(field, settings)) { json_info->numbers_parsed_from_json_strings.insert(number_type.get()); return number_type; @@ -1238,7 +1279,7 @@ namespace } /// Number - return tryInferNumber(buf, settings); + return tryInferNumber(buf, settings); } } @@ -1294,7 +1335,7 @@ void transformFinalInferredJSONTypeIfNeededImpl(DataTypePtr & data_type, const F return; } - data_type = json_paths->finalize(); + data_type = json_paths->finalize(settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects); 
transformFinalInferredJSONTypeIfNeededImpl(data_type, settings, json_info, remain_nothing_types); return; } @@ -1377,31 +1418,7 @@ void transformFinalInferredJSONTypeIfNeeded(DataTypePtr & data_type, const Forma DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings) { - ReadBufferFromString buf(field); - - if (settings.try_infer_integers) - { - Int64 tmp_int; - if (tryReadIntText(tmp_int, buf) && buf.eof()) - return std::make_shared(); - - /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof. - buf.position() = buf.buffer().begin(); - - /// In case of Int64 overflow, try to infer UInt64 - UInt64 tmp_uint; - if (tryReadIntText(tmp_uint, buf) && buf.eof()) - return std::make_shared(); - } - - /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof. - buf.position() = buf.buffer().begin(); - - Float64 tmp; - if (tryReadFloat(tmp, buf, settings) && buf.eof()) - return std::make_shared(); - - return nullptr; + return tryInferNumberFromStringImpl(field, settings); } DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings) diff --git a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference index b6d1ff865e5..47e9b86237a 100644 --- a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference +++ b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference @@ -1,2 +1,3 @@ c1 Nullable(String) c1 Nullable(Float64) +x Nullable(Float64) diff --git a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql index 2a281e898f1..4f78855f5ce 100644 --- a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql +++ b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql @@ -1,2 +1,5 @@ DESC format(CSV, 
'1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 0; DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 1; +-- This setting should not take effect on JSON formats +DESC format(JSONEachRow, '{"x" : 1.1e20}') settings input_format_try_infer_exponent_floats = 0; + diff --git a/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.reference b/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.reference new file mode 100644 index 00000000000..0318b136ade --- /dev/null +++ b/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.reference @@ -0,0 +1,3 @@ +obj Tuple(\n a Nullable(String)) +('42') +('{"b" : 42}') diff --git a/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.sql b/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.sql new file mode 100644 index 00000000000..4b986c94868 --- /dev/null +++ b/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.sql @@ -0,0 +1,4 @@ +set input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects=1; +desc format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : 42}}}'); +select * from format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : 42}}}'); + From 9a0546168094d38692725f89677077e32bd144b5 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 4 Mar 2024 17:49:33 +0000 Subject: [PATCH 2/2] Better exception message --- docs/en/interfaces/schema-inference.md | 2 +- src/Formats/SchemaInferenceUtils.cpp | 45 ++++++++++++++------------ 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index f2e9136d1db..05fae994cbe 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -568,7 +568,7 @@ Result: ```text 
Code: 636. DB::Exception: The table structure cannot be extracted from a JSONEachRow format file. Error: -Code: 117. DB::Exception: JSON objects have ambiguous paths: 'a' (with type Int64) and 'a.b'. You can enable setting input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type for path 'a'. (INCORRECT_DATA) (version 24.3.1.1). +Code: 117. DB::Exception: JSON objects have ambiguous data: in some objects path 'a' has type 'Int64' and in some - 'Tuple(b String)'. You can enable setting input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type for path 'a'. (INCORRECT_DATA) (version 24.3.1.1). You can specify the structure manually. (CANNOT_EXTRACT_TABLE_STRUCTURE) ``` diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index 998f97fae0d..cb574551d26 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -182,26 +182,6 @@ namespace DataTypePtr getType(bool use_string_type_for_ambiguous_paths) const { - /// Check if we have ambiguous paths. - /// For example: - /// 'a.b.c' : Int32 and 'a.b' : String - /// Also check if leaf type is Nothing, because the next situation is possible: - /// {"a" : {"b" : null}} -> 'a.b' : Nullable(Nothing) - /// {"a" : {"b" : {"c" : 42}}} -> 'a.b.c' : Int32 - /// And after merge we will have ambiguous paths 'a.b.c' : Int32 and 'a.b' : Nullable(Nothing), - /// but it's a valid case and we should ignore path 'a.b'. - if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty()) - { - if (use_string_type_for_ambiguous_paths) - return std::make_shared(); - throw Exception( - ErrorCodes::INCORRECT_DATA, - "JSON objects have ambiguous paths: '{}' (with type {}) and '{}'. 
You can enable setting " - "input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type " - "for path '{}'", - path, leaf_type->getName(), nodes.begin()->second.path, path); - } - if (nodes.empty()) return leaf_type; @@ -215,7 +195,30 @@ namespace node_types.push_back(node.getType(use_string_type_for_ambiguous_paths)); } - return std::make_shared(std::move(node_types), std::move(node_names)); + auto tuple_type = std::make_shared(std::move(node_types), std::move(node_names)); + + /// Check if we have ambiguous paths. + /// For example: + /// 'a.b.c' : Int32 and 'a.b' : String + /// Also check if leaf type is Nothing, because the next situation is possible: + /// {"a" : {"b" : null}} -> 'a.b' : Nullable(Nothing) + /// {"a" : {"b" : {"c" : 42}}} -> 'a.b.c' : Int32 + /// And after merge we will have ambiguous paths 'a.b.c' : Int32 and 'a.b' : Nullable(Nothing), + /// but it's a valid case and we should ignore path 'a.b'. + if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty()) + { + if (use_string_type_for_ambiguous_paths) + return std::make_shared(); + + throw Exception( + ErrorCodes::INCORRECT_DATA, + "JSON objects have ambiguous data: in some objects path '{}' has type '{}' and in some - '{}'. You can enable setting " + "input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type " + "for path '{}'", + path, leaf_type->getName(), tuple_type->getName(), path); + } + + return tuple_type; } };