From d0f9bb2ec2bac5f2f644847a90e03d4d13d56d9a Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 8 Dec 2022 18:58:18 +0000 Subject: [PATCH 1/4] Allow to parse JSON objects into Strings --- docs/en/interfaces/formats.md | 1 + docs/en/operations/settings/settings.md | 22 +++++++++++++++++++ src/Core/Settings.h | 3 ++- .../Serializations/SerializationString.cpp | 11 ++++++++-- src/Formats/EscapingRuleUtils.cpp | 14 ++++++++++++ src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + src/Formats/JSONUtils.cpp | 10 +++++++++ ...499_read_json_objects_as_strings.reference | 3 +++ .../02499_read_json_objects_as_strings.sql | 4 ++++ 10 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/02499_read_json_objects_as_strings.reference create mode 100644 tests/queries/0_stateless/02499_read_json_objects_as_strings.sql diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 731348abfe7..58998a6f491 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1202,6 +1202,7 @@ SELECT * FROM json_each_row_nested - [input_format_import_nested_json](../operations/settings/settings.md#input_format_import_nested_json) - map nested JSON data to nested tables (it works for JSONEachRow format). Default value - `false`. - [input_format_json_read_bools_as_numbers](../operations/settings/settings.md#input_format_json_read_bools_as_numbers) - allow to parse bools as numbers in JSON input formats. Default value - `true`. - [input_format_json_read_numbers_as_strings](../operations/settings/settings.md#input_format_json_read_numbers_as_strings) - allow to parse numbers as strings in JSON input formats. Default value - `false`. +- [input_format_json_read_objects_as_strings](../operations/settings/settings.md#input_format_json_read_objects_as_strings) - allow to parse JSON objects as strings in JSON input formats. Default value - `false`. 
- [output_format_json_quote_64bit_integers](../operations/settings/settings.md#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`. - [output_format_json_quote_64bit_floats](../operations/settings/settings.md#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`. - [output_format_json_quote_denormals](../operations/settings/settings.md#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 2fc6e64b7eb..d8c08b6c078 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3753,6 +3753,28 @@ Allow parsing numbers as strings in JSON input formats. Disabled by default. +### input_format_json_read_objects_as_strings {#input_format_json_read_objects_as_strings} + +Allow parsing JSON objects as strings in JSON input formats. + +Example: + +```sql +CREATE TABLE test (obj String) ENGINE=Memory(); +INSERT INTO test FORMAT JSONEachRow {"obj" : {"a" : 1, "b" : "Hello"}}; +SELECT * FROM test; +``` + +Result: + +``` +┌─obj──────────────────────┐ +│ {"a" : 1, "b" : "Hello"} │ +└──────────────────────────┘ +``` + +Disabled by default. 
+ ### input_format_json_validate_types_from_metadata {#input_format_json_validate_types_from_metadata} For JSON/JSONCompact/JSONColumnsWithMetadata input formats, if this setting is set to 1, diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 33af6710999..1f112f9ad3b 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -546,7 +546,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(Bool, collect_hash_table_stats_during_aggregation, true, "Enable collecting hash table statistics to optimize memory allocation", 0) \ M(UInt64, max_entries_for_hash_table_stats, 10'000, "How many entries hash table statistics collected during aggregation is allowed to have", 0) \ - M(UInt64, max_size_to_preallocate_for_aggregation, 10'000'000, "For how many elements it is allowed to preallocate space in all hash tables in total before aggregation", 0) \ + M(UInt64, max_size_to_preallocate_for_aggregation, 100'000'000, "For how many elements it is allowed to preallocate space in all hash tables in total before aggregation", 0) \ \ M(Bool, kafka_disable_num_consumers_limit, false, "Disable limit on kafka_num_consumers that depends on the number of available CPU cores", 0) \ M(Bool, enable_software_prefetch_in_aggregation, true, "Enable use of software prefetch in aggregation", 0) \ @@ -763,6 +763,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, input_format_json_try_infer_numbers_from_strings, true, "Try to infer numbers from string fields while schema inference", 0) \ M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \ M(Bool, input_format_json_read_numbers_as_strings, false, "Allow to parse numbers as strings in JSON input formats", 0) \ + M(Bool, input_format_json_read_objects_as_strings, false, 
"Allow to parse JSON objects as strings in JSON input formats", 0) \ M(Bool, input_format_try_infer_integers, true, "Try to infer numbers from string fields while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \ diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp index c314ed49973..96608fbbc04 100644 --- a/src/DataTypes/Serializations/SerializationString.cpp +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -311,13 +311,20 @@ void SerializationString::serializeTextJSON(const IColumn & column, size_t row_n void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"') + if (settings.json.read_objects_as_strings && !istr.eof() && *istr.position() == '{') + { + String field; + readJSONObjectPossiblyInvalid(field, istr); + ReadBufferFromString buf(field); + read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); }); + } + else if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"') { String field; readJSONField(field, istr); Float64 tmp; ReadBufferFromString buf(field); - if (tryReadFloatText(tmp, buf)) + if (tryReadFloatText(tmp, buf) && buf.eof()) read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); }); else throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON String value here: {}", field); diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 2414d8c5ba4..12942bcd13c 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ 
b/src/Formats/EscapingRuleUtils.cpp @@ -394,6 +394,7 @@ void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings bool have_maps = false; bool have_objects = false; + bool have_strings = false; bool are_maps_equal = true; DataTypePtr first_map_type; for (const auto & type : data_types) @@ -414,6 +415,10 @@ void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings { have_objects = true; } + else if (isString(type)) + { + have_strings = true; + } } if (have_maps && (have_objects || !are_maps_equal)) @@ -424,6 +429,15 @@ void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings type = std::make_shared("json", true); } } + + if (settings.json.read_objects_as_strings && have_strings && (have_maps || have_objects)) + { + for (auto & type : data_types) + { + if (isMap(type) || isObject(type)) + type = std::make_shared(); + } + } }; transformTypesRecursively(types, transform_simple_types, transform_complex_types); diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index fe84d780714..0fd9109b0d7 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -98,6 +98,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.json.quote_decimals = settings.output_format_json_quote_decimals; format_settings.json.read_bools_as_numbers = settings.input_format_json_read_bools_as_numbers; format_settings.json.read_numbers_as_strings = settings.input_format_json_read_numbers_as_strings; + format_settings.json.read_objects_as_strings = settings.input_format_json_read_objects_as_strings; format_settings.json.try_infer_numbers_from_strings = settings.input_format_json_try_infer_numbers_from_strings; format_settings.json.validate_types_from_metadata = settings.input_format_json_validate_types_from_metadata; format_settings.json.validate_utf8 = settings.output_format_json_validate_utf8; diff --git a/src/Formats/FormatSettings.h 
b/src/Formats/FormatSettings.h index ad2f05a5819..98f9e486141 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -152,6 +152,7 @@ struct FormatSettings bool serialize_as_strings = false; bool read_bools_as_numbers = true; bool read_numbers_as_strings = true; + bool read_objects_as_strings = true; bool try_infer_numbers_from_strings = false; bool validate_types_from_metadata = true; bool validate_utf8 = false; diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 926e3478ad8..d77cd14bd38 100644 --- a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -235,7 +235,12 @@ namespace JSONUtils /// If we couldn't infer nested type and Object type is not enabled, /// we can't determine the type of this JSON field. if (!settings.json.try_infer_objects) + { + /// If read_objects_as_strings is enabled, we can read objects into strings. + if (settings.json.read_objects_as_strings) + return makeNullable(std::make_shared()); return nullptr; + } continue; } @@ -257,7 +262,12 @@ namespace JSONUtils if (!are_types_equal) { if (!settings.json.try_infer_objects) + { + /// If read_objects_as_strings is enabled, we can read objects into strings. 
+ if (settings.json.read_objects_as_strings) + return makeNullable(std::make_shared()); return nullptr; + } return std::make_shared("json", true); } diff --git a/tests/queries/0_stateless/02499_read_json_objects_as_strings.reference b/tests/queries/0_stateless/02499_read_json_objects_as_strings.reference new file mode 100644 index 00000000000..4042c1f4389 --- /dev/null +++ b/tests/queries/0_stateless/02499_read_json_objects_as_strings.reference @@ -0,0 +1,3 @@ +x Nullable(String) +abc +{"a" : 10, "b" : "abc"} diff --git a/tests/queries/0_stateless/02499_read_json_objects_as_strings.sql b/tests/queries/0_stateless/02499_read_json_objects_as_strings.sql new file mode 100644 index 00000000000..277dd9578b3 --- /dev/null +++ b/tests/queries/0_stateless/02499_read_json_objects_as_strings.sql @@ -0,0 +1,4 @@ +set input_format_json_read_objects_as_strings=1; +desc format(JSONEachRow, '{"x" : "abc"}, {"x" : {"a" : 10, "b" : "abc"}}'); +select * from format(JSONEachRow, '{"x" : "abc"}, {"x" : {"a" : 10, "b" : "abc"}}'); + From e2000d1e59b8329211b62a7c0b20f18abb5e064a Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 8 Dec 2022 20:15:51 +0100 Subject: [PATCH 2/4] Fix typo --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 1f112f9ad3b..540e21372cb 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -546,7 +546,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(Bool, collect_hash_table_stats_during_aggregation, true, "Enable collecting hash table statistics to optimize memory allocation", 0) \ M(UInt64, max_entries_for_hash_table_stats, 10'000, "How many entries hash table statistics collected during aggregation is allowed to have", 0) \ - M(UInt64, max_size_to_preallocate_for_aggregation, 100'000'000, "For how many elements it is allowed to preallocate space in all hash tables in total before aggregation", 0) \ 
+ M(UInt64, max_size_to_preallocate_for_aggregation, 10'000'000, "For how many elements it is allowed to preallocate space in all hash tables in total before aggregation", 0) \ \ M(Bool, kafka_disable_num_consumers_limit, false, "Disable limit on kafka_num_consumers that depends on the number of available CPU cores", 0) \ M(Bool, enable_software_prefetch_in_aggregation, true, "Enable use of software prefetch in aggregation", 0) \ From 01e0d8d02ca28930d454f928cc7109f0c19e6862 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 8 Dec 2022 20:52:31 +0100 Subject: [PATCH 3/4] Fix fasttest --- .../queries/0_stateless/02499_read_json_objects_as_strings.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02499_read_json_objects_as_strings.sql b/tests/queries/0_stateless/02499_read_json_objects_as_strings.sql index 277dd9578b3..bb4c9e9da0f 100644 --- a/tests/queries/0_stateless/02499_read_json_objects_as_strings.sql +++ b/tests/queries/0_stateless/02499_read_json_objects_as_strings.sql @@ -1,4 +1,4 @@ +-- Tags: no-fasttest set input_format_json_read_objects_as_strings=1; desc format(JSONEachRow, '{"x" : "abc"}, {"x" : {"a" : 10, "b" : "abc"}}'); select * from format(JSONEachRow, '{"x" : "abc"}, {"x" : {"a" : 10, "b" : "abc"}}'); - From 0d17774440dfd2d7ae7cab72fda4ff39c36888ae Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 8 Dec 2022 20:57:28 +0100 Subject: [PATCH 4/4] Better example in docs --- docs/en/operations/settings/settings.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index d8c08b6c078..ddfaab02159 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3760,17 +3760,18 @@ Allow parsing JSON objects as strings in JSON input formats. 
Example: ```sql -CREATE TABLE test (obj String) ENGINE=Memory(); -INSERT INTO test FORMAT JSONEachRow {"obj" : {"a" : 1, "b" : "Hello"}}; +SET input_format_json_read_objects_as_strings = 1; +CREATE TABLE test (id UInt64, obj String, date Date) ENGINE=Memory(); +INSERT INTO test FORMAT JSONEachRow {"id" : 1, "obj" : {"a" : 1, "b" : "Hello"}, "date" : "2020-01-01"}; SELECT * FROM test; ``` Result: ``` -┌─obj──────────────────────┐ -│ {"a" : 1, "b" : "Hello"} │ -└──────────────────────────┘ +┌─id─┬─obj──────────────────────┬───────date─┐ +│ 1 │ {"a" : 1, "b" : "Hello"} │ 2020-01-01 │ +└────┴──────────────────────────┴────────────┘ ``` Disabled by default.