Merge pull request #44052 from Avogar/object-as-string

Allow to parse JSON objects into Strings in JSON input formats
This commit is contained in:
Kruglov Pavel 2022-12-09 15:12:25 +01:00 committed by GitHub
commit 59201c71cb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 67 additions and 2 deletions

View File

@ -1202,6 +1202,7 @@ SELECT * FROM json_each_row_nested
- [input_format_import_nested_json](../operations/settings/settings.md#input_format_import_nested_json) - map nested JSON data to nested tables (it works for JSONEachRow format). Default value - `false`.
- [input_format_json_read_bools_as_numbers](../operations/settings/settings.md#input_format_json_read_bools_as_numbers) - allow to parse bools as numbers in JSON input formats. Default value - `true`.
- [input_format_json_read_numbers_as_strings](../operations/settings/settings.md#input_format_json_read_numbers_as_strings) - allow to parse numbers as strings in JSON input formats. Default value - `false`.
- [input_format_json_read_objects_as_strings](../operations/settings/settings.md#input_format_json_read_objects_as_strings) - allow to parse JSON objects as strings in JSON input formats. Default value - `false`.
- [output_format_json_quote_64bit_integers](../operations/settings/settings.md#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`.
- [output_format_json_quote_64bit_floats](../operations/settings/settings.md#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`.
- [output_format_json_quote_denormals](../operations/settings/settings.md#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`.

View File

@ -3753,6 +3753,29 @@ Allow parsing numbers as strings in JSON input formats.
Disabled by default.
### input_format_json_read_objects_as_strings {#input_format_json_read_objects_as_strings}
Allow parsing JSON objects as strings in JSON input formats.
Example:
```sql
SET input_format_json_read_objects_as_strings = 1;
CREATE TABLE test (id UInt64, obj String, date Date) ENGINE=Memory();
INSERT INTO test FORMAT JSONEachRow {"id" : 1, "obj" : {"a" : 1, "b" : "Hello"}, "date" : "2020-01-01"};
SELECT * FROM test;
```
Result:
```
┌─id─┬─obj──────────────────────┬───────date─┐
│  1 │ {"a" : 1, "b" : "Hello"} │ 2020-01-01 │
└────┴──────────────────────────┴────────────┘
```
Disabled by default.
### input_format_json_validate_types_from_metadata {#input_format_json_validate_types_from_metadata}
For JSON/JSONCompact/JSONColumnsWithMetadata input formats, if this setting is set to 1,

View File

@ -764,6 +764,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, input_format_json_try_infer_numbers_from_strings, true, "Try to infer numbers from string fields while schema inference", 0) \
M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \
M(Bool, input_format_json_read_numbers_as_strings, false, "Allow to parse numbers as strings in JSON input formats", 0) \
M(Bool, input_format_json_read_objects_as_strings, false, "Allow to parse JSON objects as strings in JSON input formats", 0) \
M(Bool, input_format_try_infer_integers, true, "Try to infer numbers from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \

View File

@ -311,13 +311,20 @@ void SerializationString::serializeTextJSON(const IColumn & column, size_t row_n
void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"')
if (settings.json.read_objects_as_strings && !istr.eof() && *istr.position() == '{')
{
    /// The next value starts with '{': read the whole JSON object text into `field`
    /// and append it to the column as the string value.
    String field;
    readJSONObjectPossiblyInvalid(field, istr);
    read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); });
}
else if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"')
{
String field;
readJSONField(field, istr);
Float64 tmp;
ReadBufferFromString buf(field);
if (tryReadFloatText(tmp, buf))
if (tryReadFloatText(tmp, buf) && buf.eof())
read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); });
else
throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON String value here: {}", field);

View File

@ -394,6 +394,7 @@ void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings
bool have_maps = false;
bool have_objects = false;
bool have_strings = false;
bool are_maps_equal = true;
DataTypePtr first_map_type;
for (const auto & type : data_types)
@ -414,6 +415,10 @@ void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings
{
have_objects = true;
}
else if (isString(type))
{
    /// Record that a String was inferred; the later read_objects_as_strings step
    /// only converts maps/objects to String when this flag is set.
    have_strings = true;
}
}
if (have_maps && (have_objects || !are_maps_equal))
@ -424,6 +429,15 @@ void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings
type = std::make_shared<DataTypeObject>("json", true);
}
}
/// When read_objects_as_strings is enabled and the inferred types contain a String
/// together with at least one Map or Object, replace every Map/Object type with String.
if (settings.json.read_objects_as_strings && have_strings && (have_maps || have_objects))
{
for (auto & type : data_types)
{
if (isMap(type) || isObject(type))
type = std::make_shared<DataTypeString>();
}
}
};
transformTypesRecursively(types, transform_simple_types, transform_complex_types);

View File

@ -98,6 +98,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.json.quote_decimals = settings.output_format_json_quote_decimals;
format_settings.json.read_bools_as_numbers = settings.input_format_json_read_bools_as_numbers;
format_settings.json.read_numbers_as_strings = settings.input_format_json_read_numbers_as_strings;
format_settings.json.read_objects_as_strings = settings.input_format_json_read_objects_as_strings;
format_settings.json.try_infer_numbers_from_strings = settings.input_format_json_try_infer_numbers_from_strings;
format_settings.json.validate_types_from_metadata = settings.input_format_json_validate_types_from_metadata;
format_settings.json.validate_utf8 = settings.output_format_json_validate_utf8;

View File

@ -152,6 +152,7 @@ struct FormatSettings
bool serialize_as_strings = false;
bool read_bools_as_numbers = true;
/// NOTE(review): the user-facing settings input_format_json_read_numbers_as_strings and
/// input_format_json_read_objects_as_strings are declared with default `false`, while these
/// in-struct defaults are `true`. getFormatSettings() overwrites both fields from the settings,
/// so the values here presumably only apply to FormatSettings built without it — confirm this is intended.
bool read_numbers_as_strings = true;
bool read_objects_as_strings = true;
bool try_infer_numbers_from_strings = false;
bool validate_types_from_metadata = true;
bool validate_utf8 = false;

View File

@ -235,7 +235,12 @@ namespace JSONUtils
/// If we couldn't infer nested type and Object type is not enabled,
/// we can't determine the type of this JSON field.
if (!settings.json.try_infer_objects)
{
/// If read_objects_as_strings is enabled, we can read objects into strings.
if (settings.json.read_objects_as_strings)
return makeNullable(std::make_shared<DataTypeString>());
return nullptr;
}
continue;
}
@ -257,7 +262,12 @@ namespace JSONUtils
if (!are_types_equal)
{
if (!settings.json.try_infer_objects)
{
/// If read_objects_as_strings is enabled, we can read objects into strings.
if (settings.json.read_objects_as_strings)
return makeNullable(std::make_shared<DataTypeString>());
return nullptr;
}
return std::make_shared<DataTypeObject>("json", true);
}

View File

@ -0,0 +1,3 @@
x Nullable(String)
abc
{"a" : 10, "b" : "abc"}

View File

@ -0,0 +1,4 @@
-- Tags: no-fasttest
set input_format_json_read_objects_as_strings=1;
desc format(JSONEachRow, '{"x" : "abc"}, {"x" : {"a" : 10, "b" : "abc"}}');
select * from format(JSONEachRow, '{"x" : "abc"}, {"x" : {"a" : 10, "b" : "abc"}}');