Small improvements in JSON schema inference

2024-11-22 15:42:02 +00:00 · 2024-03-04 17:32:22 +00:00 · 2024-03-04 17:32:22 +00:00 · 70abdf7a41
commit 70abdf7a41
parent fb02137fcc
11 changed files with 142 additions and 41 deletions
--- a/docs/en/interfaces/schema-inference.md
+++ b/docs/en/interfaces/schema-inference.md
@ -549,6 +549,48 @@ Result:
 └───────┴─────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```

+##### input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects
+
+Enabling this setting allows using the String type for ambiguous paths during named tuple inference from JSON objects (when `input_format_json_try_infer_named_tuples_from_objects` is enabled) instead of throwing an exception.
+It allows reading JSON objects as named Tuples even if there are ambiguous paths.
+
+Disabled by default.
+
+**Examples**
+
+With disabled setting:
+```sql
+SET input_format_json_try_infer_named_tuples_from_objects = 1;
+SET input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = 0;
+DESC format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
+```
+Result:
+
+```text
+Code: 636. DB::Exception: The table structure cannot be extracted from a JSONEachRow format file. Error:
+Code: 117. DB::Exception: JSON objects have ambiguous paths: 'a' (with type Int64) and 'a.b'. You can enable setting input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type for path 'a'. (INCORRECT_DATA) (version 24.3.1.1).
+You can specify the structure manually. (CANNOT_EXTRACT_TABLE_STRUCTURE)
+```
+
+With enabled setting:
+```sql
+SET input_format_json_try_infer_named_tuples_from_objects = 1;
+SET input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = 1;
+DESC format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
+SELECT * FROM format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
+```
+
+Result:
+```text
+┌─name─┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ obj  │ Tuple(a Nullable(String))     │              │                    │         │                  │                │
+└──────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+┌─obj─────────────────┐
+│ ('42')              │
+│ ('{"b" : "Hello"}') │
+└─────────────────────┘
+```
+
 ##### input_format_json_read_objects_as_strings

 Enabling this setting allows reading nested JSON objects as strings.
@ -1554,6 +1596,28 @@ DESC format(JSONEachRow, $$
 └──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```

+#### input_format_try_infer_exponent_floats
+
+If enabled, ClickHouse will try to infer floats in exponential form for text formats (except JSON, where numbers in exponential form are always inferred).
+
+Disabled by default.
+
+**Example**
+
+```sql
+SET input_format_try_infer_exponent_floats = 1;
+DESC format(CSV,
+$$1.1E10
+2.3e-12
+42E00
+$$)
+```
+```response
+┌─name─┬─type──────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ c1   │ Nullable(Float64) │              │                    │         │                  │                │
+└──────┴───────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+```
+
 ## Self describing formats {#self-describing-formats}

 Self-describing formats contain information about the structure of the data in the data itself,
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -1018,6 +1018,7 @@ class IColumn;
    M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \
    M(Bool, input_format_json_read_arrays_as_strings, true, "Allow to parse JSON arrays as strings in JSON input formats", 0) \
    M(Bool, input_format_json_try_infer_named_tuples_from_objects, true, "Try to infer named tuples from JSON objects in JSON input formats", 0) \
+    M(Bool, input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects, false, "Use String type instead of an exception in case of ambiguous paths in JSON objects during named tuples inference", 0) \
    M(Bool, input_format_json_infer_incomplete_types_as_strings, true, "Use type String for keys that contains only Nulls or empty objects/arrays during schema inference in JSON input formats", 0) \
    M(Bool, input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \
    M(Bool, input_format_json_ignore_unknown_keys_in_named_tuple, true, "Ignore unknown keys in json object for named tuples", 0) \
@ -1025,7 +1026,7 @@ class IColumn;
    M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
    M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
    M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \
-    M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats", 0) \
+    M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats (except JSON, where exponent numbers are always inferred)", 0) \
    M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \
    M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \
    M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \
--- a/src/Core/SettingsChangesHistory.h
+++ b/src/Core/SettingsChangesHistory.h
@ -85,6 +85,9 @@ namespace SettingsChangesHistory
 /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
 static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
 {
+    {"24.3", {
+                 {"input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects", false, false, "Allow to use String type for ambiguous paths during named tuple inference from JSON objects"},
+             }},
    {"24.2", {
              {"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"},
              {"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"},
--- a/src/Formats/EscapingRuleUtils.cpp
+++ b/src/Formats/EscapingRuleUtils.cpp
@ -450,8 +450,10 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
            break;
        case FormatSettings::EscapingRule::JSON:
            result += fmt::format(
-                ", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_bools_as_strings={}, read_objects_as_strings={}, read_numbers_as_strings={}, "
-                "read_arrays_as_strings={}, try_infer_objects_as_tuples={}, infer_incomplete_types_as_strings={}, try_infer_objects={}",
+                ", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_bools_as_strings={}, read_objects_as_strings={}, "
+                "read_numbers_as_strings={}, "
+                "read_arrays_as_strings={}, try_infer_objects_as_tuples={}, infer_incomplete_types_as_strings={}, try_infer_objects={}, "
+                "use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects={}",
                settings.json.try_infer_numbers_from_strings,
                settings.json.read_bools_as_numbers,
                settings.json.read_bools_as_strings,
@ -460,7 +462,8 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
                settings.json.read_arrays_as_strings,
                settings.json.try_infer_objects_as_tuples,
                settings.json.infer_incomplete_types_as_strings,
-                settings.json.allow_object_type);
+                settings.json.allow_object_type,
+                settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects);
            break;
        default:
            break;
--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@ -105,6 +105,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
    format_settings.json.write_named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
    format_settings.json.skip_null_value_in_named_tuples = settings.output_format_json_skip_null_value_in_named_tuples;
    format_settings.json.read_named_tuples_as_objects = settings.input_format_json_named_tuples_as_objects;
+    format_settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = settings.input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects;
    format_settings.json.defaults_for_missing_elements_in_named_tuple = settings.input_format_json_defaults_for_missing_elements_in_named_tuple;
    format_settings.json.ignore_unknown_keys_in_named_tuple = settings.input_format_json_ignore_unknown_keys_in_named_tuple;
    format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@ -202,6 +202,7 @@ struct FormatSettings
        bool quote_decimals = false;
        bool escape_forward_slashes = true;
        bool read_named_tuples_as_objects = false;
+        bool use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = false;
        bool write_named_tuples_as_objects = false;
        bool skip_null_value_in_named_tuples = false;
        bool defaults_for_missing_elements_in_named_tuple = false;
--- a/src/Formats/SchemaInferenceUtils.cpp
+++ b/src/Formats/SchemaInferenceUtils.cpp
@ -136,7 +136,7 @@ namespace

        bool empty() const { return paths.empty(); }

-        DataTypePtr finalize() const
+        DataTypePtr finalize(bool use_string_type_for_ambiguous_paths = false) const
        {
            if (paths.empty())
                throw Exception(ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA, "Cannot infer named Tuple from JSON object because object is empty");
@ -167,7 +167,7 @@ namespace
                current_node->leaf_type = type;
            }

-            return root_node.getType();
+            return root_node.getType(use_string_type_for_ambiguous_paths);
        }

    private:
@ -180,7 +180,7 @@ namespace
            /// Store path to this node for better exception message in case of ambiguous paths.
            String path;

-            DataTypePtr getType() const
+            DataTypePtr getType(bool use_string_type_for_ambiguous_paths) const
            {
                /// Check if we have ambiguous paths.
                /// For example:
@ -191,7 +191,16 @@ namespace
                /// And after merge we will have ambiguous paths 'a.b.c' : Int32 and 'a.b' : Nullable(Nothing),
                /// but it's a valid case and we should ignore path 'a.b'.
                if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty())
-                    throw Exception(ErrorCodes::INCORRECT_DATA, "JSON objects have ambiguous paths: '{}' with type {} and '{}'", path, leaf_type->getName(), nodes.begin()->second.path);
+                {
+                    if (use_string_type_for_ambiguous_paths)
+                        return std::make_shared<DataTypeString>();
+                    throw Exception(
+                        ErrorCodes::INCORRECT_DATA,
+                        "JSON objects have ambiguous paths: '{}' (with type {}) and '{}'. You can enable setting "
+                        "input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type "
+                        "for path '{}'",
+                        path, leaf_type->getName(), nodes.begin()->second.path, path);
+                }

                if (nodes.empty())
                    return leaf_type;
@ -203,7 +212,7 @@ namespace
                for (const auto & [name, node] : nodes)
                {
                    node_names.push_back(name);
-                    node_types.push_back(node.getType());
+                    node_types.push_back(node.getType(use_string_type_for_ambiguous_paths));
                }

                return std::make_shared<DataTypeTuple>(std::move(node_types), std::move(node_names));
@ -866,13 +875,15 @@ namespace
        return std::make_shared<DataTypeTuple>(nested_types);
    }

+    template <bool is_json>
    bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings)
    {
-        if (settings.try_infer_exponent_floats)
+        if (is_json || settings.try_infer_exponent_floats)
            return tryReadFloatText(value, buf);
        return tryReadFloatTextNoExponent(value, buf);
    }

+    template <bool is_json>
    DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings)
    {
        if (buf.eof())
@ -911,7 +922,7 @@ namespace
                    buf.position() = number_start;
                }

-                if (tryReadFloat(tmp_float, buf, settings))
+                if (tryReadFloat<is_json>(tmp_float, buf, settings))
                {
                    if (read_int && buf.position() == int_end)
                        return std::make_shared<DataTypeInt64>();
@ -945,7 +956,7 @@ namespace
                peekable_buf.rollbackToCheckpoint(true);
            }

-            if (tryReadFloat(tmp_float, peekable_buf, settings))
+            if (tryReadFloat<is_json>(tmp_float, peekable_buf, settings))
            {
                /// Float parsing reads no fewer bytes than integer parsing,
                /// so position of the buffer is either the same, or further.
@ -957,7 +968,7 @@ namespace
                return std::make_shared<DataTypeFloat64>();
            }
        }
-        else if (tryReadFloat(tmp_float, buf, settings))
+        else if (tryReadFloat<is_json>(tmp_float, buf, settings))
        {
            return std::make_shared<DataTypeFloat64>();
        }
@ -966,6 +977,36 @@ namespace
        return nullptr;
    }

+    template <bool is_json>
+    DataTypePtr tryInferNumberFromStringImpl(std::string_view field, const FormatSettings & settings)
+    {
+        ReadBufferFromString buf(field);
+
+        if (settings.try_infer_integers)
+        {
+            Int64 tmp_int;
+            if (tryReadIntText(tmp_int, buf) && buf.eof())
+                return std::make_shared<DataTypeInt64>();
+
+            /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
+            buf.position() = buf.buffer().begin();
+
+            /// In case of Int64 overflow, try to infer UInt64
+            UInt64 tmp_uint;
+            if (tryReadIntText(tmp_uint, buf) && buf.eof())
+                return std::make_shared<DataTypeUInt64>();
+        }
+
+        /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
+        buf.position() = buf.buffer().begin();
+
+        Float64 tmp;
+        if (tryReadFloat<is_json>(tmp, buf, settings) && buf.eof())
+            return std::make_shared<DataTypeFloat64>();
+
+        return nullptr;
+    }
+
    template <bool is_json>
    DataTypePtr tryInferString(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
    {
@ -995,7 +1036,7 @@ namespace
        {
            if (settings.json.try_infer_numbers_from_strings)
            {
-                if (auto number_type = tryInferNumberFromString(field, settings))
+                if (auto number_type = tryInferNumberFromStringImpl<true>(field, settings))
                {
                    json_info->numbers_parsed_from_json_strings.insert(number_type.get());
                    return number_type;
@ -1238,7 +1279,7 @@ namespace
        }

        /// Number
-        return tryInferNumber(buf, settings);
+        return tryInferNumber<is_json>(buf, settings);
    }
 }

@ -1294,7 +1335,7 @@ void transformFinalInferredJSONTypeIfNeededImpl(DataTypePtr & data_type, const F
            return;
        }

-        data_type = json_paths->finalize();
+        data_type = json_paths->finalize(settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects);
        transformFinalInferredJSONTypeIfNeededImpl(data_type, settings, json_info, remain_nothing_types);
        return;
    }
@ -1377,31 +1418,7 @@ void transformFinalInferredJSONTypeIfNeeded(DataTypePtr & data_type, const Forma

 DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings)
 {
-    ReadBufferFromString buf(field);
-
-    if (settings.try_infer_integers)
-    {
-        Int64 tmp_int;
-        if (tryReadIntText(tmp_int, buf) && buf.eof())
-            return std::make_shared<DataTypeInt64>();
-
-        /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
-        buf.position() = buf.buffer().begin();
-
-        /// In case of Int64 overflow, try to infer UInt64
-        UInt64 tmp_uint;
-        if (tryReadIntText(tmp_uint, buf) && buf.eof())
-            return std::make_shared<DataTypeUInt64>();
-    }
-
-    /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
-    buf.position() = buf.buffer().begin();
-
-    Float64 tmp;
-    if (tryReadFloat(tmp, buf, settings) && buf.eof())
-        return std::make_shared<DataTypeFloat64>();
-
-    return nullptr;
+    return tryInferNumberFromStringImpl<false>(field, settings);
 }

 DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings)
--- a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference
+++ b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference
@ -1,2 +1,3 @@
 c1	Nullable(String)					
 c1	Nullable(Float64)					
+x	Nullable(Float64)					
--- a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql
+++ b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql
@ -1,2 +1,5 @@
 DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 0;
 DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 1;
+-- This setting should not take effect on JSON formats
+DESC format(JSONEachRow, '{"x" : 1.1e20}') settings input_format_try_infer_exponent_floats = 0;
+
--- a/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.reference
+++ b/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.reference
@ -0,0 +1,3 @@
+obj	Tuple(\n    a Nullable(String))					
+('42')
+('{"b" : 42}')
--- a/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.sql
+++ b/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.sql
@ -0,0 +1,4 @@
+set input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects=1;
+desc format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : 42}}}');
+select * from format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : 42}}}');
+