mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 09:32:06 +00:00
Better schema inference for JSON formats
This commit is contained in:
parent
acadda1665
commit
42ca897f2d
@ -1259,9 +1259,12 @@ SELECT * FROM json_each_row_nested
|
||||
|
||||
- [input_format_import_nested_json](/docs/en/operations/settings/settings-formats.md/#input_format_import_nested_json) - map nested JSON data to nested tables (it works for JSONEachRow format). Default value - `false`.
|
||||
- [input_format_json_read_bools_as_numbers](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_bools_as_numbers) - allow to parse bools as numbers in JSON input formats. Default value - `true`.
|
||||
- [input_format_json_read_numbers_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_numbers_as_strings) - allow to parse numbers as strings in JSON input formats. Default value - `false`.
|
||||
- [input_format_json_read_objects_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_objects_as_strings) - allow to parse JSON objects as strings in JSON input formats. Default value - `false`.
|
||||
- [input_format_json_read_numbers_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_numbers_as_strings) - allow to parse numbers as strings in JSON input formats. Default value - `true`.
|
||||
- [input_format_json_read_arrays_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_arrays_as_strings) - allow to parse JSON arrays as strings in JSON input formats. Default value - `true`.
|
||||
- [input_format_json_read_objects_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_objects_as_strings) - allow to parse JSON objects as strings in JSON input formats. Default value - `true`.
|
||||
- [input_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings-formats.md/#input_format_json_named_tuples_as_objects) - parse named tuple columns as JSON objects. Default value - `true`.
|
||||
- [input_format_json_try_infer_named_tuples_from_objects](/docs/en/operations/settings/settings-formats.md/#input_format_json_try_infer_named_tuples_from_objects) - try to infer named tuple from JSON objects during schema inference. Default value - `true`.
|
||||
- [input_format_json_infer_incomplete_types_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_infer_incomplete_types_as_strings) - use type String for keys that contains only Nulls or empty objects/arrays during schema inference in JSON input formats. Default value - `true`.
|
||||
- [input_format_json_defaults_for_missing_elements_in_named_tuple](/docs/en/operations/settings/settings-formats.md/#input_format_json_defaults_for_missing_elements_in_named_tuple) - insert default values for missing elements in JSON object while parsing named tuple. Default value - `true`.
|
||||
- [input_format_json_ignore_unknown_keys_in_named_tuple](/docs/en/operations/settings/settings-formats.md/#input_format_json_ignore_unknown_keys_in_named_tuple) - Ignore unknown keys in json object for named tuples. Default value - `false`.
|
||||
- [input_format_json_compact_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_json_compact_allow_variable_number_of_columns) - allow variable number of columns in JSONCompact/JSONCompactEachRow format, ignore extra columns and use default values on missing columns. Default value - `false`.
|
||||
|
@ -389,9 +389,25 @@ DESC format(JSONEachRow, '{"arr" : [null, 42, null]}')
|
||||
└──────┴────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
```
|
||||
|
||||
Tuples:
|
||||
Named tuples:
|
||||
|
||||
In JSON formats we treat Arrays with elements of different types as Tuples.
|
||||
When setting `input_format_json_try_infer_named_tuples_from_objects` is enabled, during schema inference ClickHouse will try to infer named Tuple from JSON objects.
|
||||
The resulting named Tuple will contain all elements from all corresponding JSON objects from sample data.
|
||||
|
||||
```sql
|
||||
SET input_format_json_try_infer_named_tuples_from_objects = 1;
|
||||
DESC format(JSONEachRow, '{"obj" : {"a" : 42, "b" : "Hello"}}, {"obj" : {"a" : 43, "c" : [1, 2, 3]}}, {"obj" : {"d" : {"e" : 42}}}')
|
||||
```
|
||||
|
||||
```response
|
||||
┌─name─┬─type───────────────────────────────────────────────────────────────────────────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||
│ obj │ Tuple(a Nullable(Int64), b Nullable(String), c Array(Nullable(Int64)), d Tuple(e Nullable(Int64))) │ │ │ │ │ │
|
||||
└──────┴────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
```
|
||||
|
||||
Unnamed Tuples:
|
||||
|
||||
In JSON formats we treat Arrays with elements of different types as Unnamed Tuples.
|
||||
```sql
|
||||
DESC format(JSONEachRow, '{"tuple" : [1, "Hello, World!", [1, 2, 3]]}')
|
||||
```
|
||||
@ -418,7 +434,10 @@ DESC format(JSONEachRow, $$
|
||||
Maps:
|
||||
|
||||
In JSON we can read objects with values of the same type as Map type.
|
||||
Note: it will work only when settings `input_format_json_read_objects_as_strings` and `input_format_json_try_infer_named_tuples_from_objects` are disabled.
|
||||
|
||||
```sql
|
||||
SET input_format_json_read_objects_as_strings = 0, input_format_json_try_infer_named_tuples_from_objects = 0;
|
||||
DESC format(JSONEachRow, '{"map" : {"key1" : 42, "key2" : 24, "key3" : 4}}')
|
||||
```
|
||||
```response
|
||||
@ -448,14 +467,22 @@ Nested complex types:
|
||||
DESC format(JSONEachRow, '{"value" : [[[42, 24], []], {"key1" : 42, "key2" : 24}]}')
|
||||
```
|
||||
```response
|
||||
┌─name──┬─type───────────────────────────────────────────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||
│ value │ Tuple(Array(Array(Nullable(Int64))), Map(String, Nullable(Int64))) │ │ │ │ │ │
|
||||
└───────┴────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
┌─name──┬─type─────────────────────────────────────────────────────────────────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||
│ value │ Tuple(Array(Array(Nullable(String))), Tuple(key1 Nullable(Int64), key2 Nullable(Int64))) │ │ │ │ │ │
|
||||
└───────┴──────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
```
|
||||
|
||||
If ClickHouse cannot determine the type, because the data contains only nulls, an exception will be thrown:
|
||||
If ClickHouse cannot determine the type for some key, because the data contains only nulls/empty objects/empty arrays, type `String` will be used if setting `input_format_json_infer_incomplete_types_as_strings` is enabled or an exception will be thrown otherwise:
|
||||
```sql
|
||||
DESC format(JSONEachRow, '{"arr" : [null, null]}')
|
||||
DESC format(JSONEachRow, '{"arr" : [null, null]}') SETTINGS input_format_json_infer_incomplete_types_as_strings = 1;
|
||||
```
|
||||
```response
|
||||
┌─name─┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||
│ arr │ Array(Nullable(String)) │ │ │ │ │ │
|
||||
└──────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
```
|
||||
```sql
|
||||
DESC format(JSONEachRow, '{"arr" : [null, null]}') SETTINGS input_format_json_infer_incomplete_types_as_strings = 0;
|
||||
```
|
||||
```response
|
||||
Code: 652. DB::Exception: Received from localhost:9000. DB::Exception:
|
||||
@ -466,26 +493,6 @@ most likely this column contains only Nulls or empty Arrays/Maps.
|
||||
|
||||
#### JSON settings {#json-settings}
|
||||
|
||||
##### input_format_json_read_objects_as_strings
|
||||
|
||||
Enabling this setting allows reading nested JSON objects as strings.
|
||||
This setting can be used to read nested JSON objects without using JSON object type.
|
||||
|
||||
This setting is enabled by default.
|
||||
|
||||
```sql
|
||||
SET input_format_json_read_objects_as_strings = 1;
|
||||
DESC format(JSONEachRow, $$
|
||||
{"obj" : {"key1" : 42, "key2" : [1,2,3,4]}}
|
||||
{"obj" : {"key3" : {"nested_key" : 1}}}
|
||||
$$)
|
||||
```
|
||||
```response
|
||||
┌─name─┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||
│ obj │ Nullable(String) │ │ │ │ │ │
|
||||
└──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
```
|
||||
|
||||
##### input_format_json_try_infer_numbers_from_strings
|
||||
|
||||
Enabling this setting allows inferring numbers from string values.
|
||||
@ -512,7 +519,7 @@ DESC format(JSONEachRow, $$
|
||||
Enabling this setting allows inferring named Tuples from JSON objects. The resulting named Tuple will contain all elements from all corresponding JSON objects from sample data.
|
||||
It can be useful when JSON data is not sparse so the sample of data will contain all possible object keys.
|
||||
|
||||
This setting is disabled by default.
|
||||
This setting is enabled by default.
|
||||
|
||||
**Example**
|
||||
|
||||
@ -542,11 +549,34 @@ Result:
|
||||
└───────┴─────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
```
|
||||
|
||||
##### input_format_json_read_objects_as_strings
|
||||
|
||||
Enabling this setting allows reading nested JSON objects as strings.
|
||||
This setting can be used to read nested JSON objects without using JSON object type.
|
||||
|
||||
This setting is enabled by default.
|
||||
|
||||
Note: enabling this setting will take effect only if setting `input_format_json_try_infer_named_tuples_from_objects` is disabled.
|
||||
|
||||
```sql
|
||||
SET input_format_json_read_objects_as_strings = 1, input_format_json_try_infer_named_tuples_from_objects = 0;
|
||||
DESC format(JSONEachRow, $$
|
||||
{"obj" : {"key1" : 42, "key2" : [1,2,3,4]}}
|
||||
{"obj" : {"key3" : {"nested_key" : 1}}}
|
||||
$$)
|
||||
```
|
||||
```response
|
||||
┌─name─┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||
│ obj │ Nullable(String) │ │ │ │ │ │
|
||||
└──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
```
|
||||
|
||||
|
||||
##### input_format_json_read_numbers_as_strings
|
||||
|
||||
Enabling this setting allows reading numeric values as strings.
|
||||
|
||||
This setting is disabled by default.
|
||||
This setting is enabled by default.
|
||||
|
||||
**Example**
|
||||
|
||||
@ -584,6 +614,49 @@ DESC format(JSONEachRow, $$
|
||||
└───────┴─────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
```
|
||||
|
||||
##### input_format_json_read_arrays_as_strings
|
||||
|
||||
Enabling this setting allows reading JSON array values as strings.
|
||||
|
||||
This setting is enabled by default.
|
||||
|
||||
**Example**
|
||||
|
||||
```sql
|
||||
SET input_format_json_read_arrays_as_strings = 1;
|
||||
SELECT arr, toTypeName(arr), JSONExtractArrayRaw(arr)[3] from format(JSONEachRow, 'arr String', '{"arr" : [1, "Hello", [1,2,3]]}');
|
||||
```
|
||||
```response
|
||||
┌─arr───────────────────┬─toTypeName(arr)─┬─arrayElement(JSONExtractArrayRaw(arr), 3)─┐
|
||||
│ [1, "Hello", [1,2,3]] │ String │ [1,2,3] │
|
||||
└───────────────────────┴─────────────────┴───────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
##### input_format_json_infer_incomplete_types_as_strings
|
||||
|
||||
Enabling this setting allows to use String type for JSON keys that contain only `Null`/`{}`/`[]` in data sample during schema inference.
|
||||
In JSON formats any value can be read as String if all corresponding settings are enabled (they are all enabled by default), and we can avoid errors like `Cannot determine type for column 'column_name' by first 25000 rows of data, most likely this column contains only Nulls or empty Arrays/Maps` during schema inference
|
||||
by using String type for keys with unknown types.
|
||||
|
||||
Example:
|
||||
|
||||
```sql
|
||||
SET input_format_json_infer_incomplete_types_as_strings = 1, input_format_json_try_infer_named_tuples_from_objects = 1;
|
||||
DESCRIBE format(JSONEachRow, '{"obj" : {"a" : [1,2,3], "b" : "hello", "c" : null, "d" : {}, "e" : []}}');
|
||||
SELECT * FROM format(JSONEachRow, '{"obj" : {"a" : [1,2,3], "b" : "hello", "c" : null, "d" : {}, "e" : []}}');
|
||||
```
|
||||
|
||||
Result:
|
||||
```
|
||||
┌─name─┬─type───────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||
│ obj │ Tuple(a Array(Nullable(Int64)), b Nullable(String), c Nullable(String), d Nullable(String), e Array(Nullable(String))) │ │ │ │ │ │
|
||||
└──────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
|
||||
┌─obj────────────────────────────┐
|
||||
│ ([1,2,3],'hello',NULL,'{}',[]) │
|
||||
└────────────────────────────────┘
|
||||
```
|
||||
|
||||
### CSV {#csv}
|
||||
|
||||
In CSV format ClickHouse extracts column values from the row according to delimiters. ClickHouse expects all types except numbers and strings to be enclosed in double quotes. If the value is in double quotes, ClickHouse tries to parse
|
||||
|
@ -381,7 +381,7 @@ Enabled by default.
|
||||
|
||||
Allow parsing numbers as strings in JSON input formats.
|
||||
|
||||
Disabled by default.
|
||||
Enabled by default.
|
||||
|
||||
## input_format_json_read_objects_as_strings {#input_format_json_read_objects_as_strings}
|
||||
|
||||
@ -426,7 +426,54 @@ Result:
|
||||
└──────┴────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
```
|
||||
|
||||
Disabled by default.
|
||||
Enabled by default.
|
||||
|
||||
## input_format_json_read_arrays_as_strings {#input_format_json_read_arrays_as_strings}
|
||||
|
||||
Allow parsing JSON arrays as strings in JSON input formats.
|
||||
|
||||
Example:
|
||||
|
||||
```sql
|
||||
SET input_format_json_read_arrays_as_strings = 1;
|
||||
SELECT arr, toTypeName(arr), JSONExtractArrayRaw(arr)[3] from format(JSONEachRow, 'arr String', '{"arr" : [1, "Hello", [1,2,3]]}');
|
||||
```
|
||||
|
||||
Result:
|
||||
```
|
||||
┌─arr───────────────────┬─toTypeName(arr)─┬─arrayElement(JSONExtractArrayRaw(arr), 3)─┐
|
||||
│ [1, "Hello", [1,2,3]] │ String │ [1,2,3] │
|
||||
└───────────────────────┴─────────────────┴───────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
Enabled by default.
|
||||
|
||||
## input_format_json_infer_incomplete_types_as_strings {#input_format_json_infer_incomplete_types_as_strings}
|
||||
|
||||
Allow to use String type for JSON keys that contain only `Null`/`{}`/`[]` in data sample during schema inference.
|
||||
In JSON formats any value can be read as String, and we can avoid errors like `Cannot determine type for column 'column_name' by first 25000 rows of data, most likely this column contains only Nulls or empty Arrays/Maps` during schema inference
|
||||
by using String type for keys with unknown types.
|
||||
|
||||
Example:
|
||||
|
||||
```sql
|
||||
SET input_format_json_infer_incomplete_types_as_strings = 1, input_format_json_try_infer_named_tuples_from_objects = 1;
|
||||
DESCRIBE format(JSONEachRow, '{"obj" : {"a" : [1,2,3], "b" : "hello", "c" : null, "d" : {}, "e" : []}}');
|
||||
SELECT * FROM format(JSONEachRow, '{"obj" : {"a" : [1,2,3], "b" : "hello", "c" : null, "d" : {}, "e" : []}}');
|
||||
```
|
||||
|
||||
Result:
|
||||
```
|
||||
┌─name─┬─type───────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||
│ obj │ Tuple(a Array(Nullable(Int64)), b Nullable(String), c Nullable(String), d Nullable(String), e Array(Nullable(String))) │ │ │ │ │ │
|
||||
└──────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
|
||||
┌─obj────────────────────────────┐
|
||||
│ ([1,2,3],'hello',NULL,'{}',[]) │
|
||||
└────────────────────────────────┘
|
||||
```
|
||||
|
||||
Enabled by default.
|
||||
|
||||
## input_format_json_validate_types_from_metadata {#input_format_json_validate_types_from_metadata}
|
||||
|
||||
|
@ -925,9 +925,11 @@ class IColumn;
|
||||
M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
|
||||
M(Bool, input_format_json_try_infer_numbers_from_strings, true, "Try to infer numbers from string fields while schema inference", 0) \
|
||||
M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \
|
||||
M(Bool, input_format_json_read_numbers_as_strings, false, "Allow to parse numbers as strings in JSON input formats", 0) \
|
||||
M(Bool, input_format_json_read_numbers_as_strings, true, "Allow to parse numbers as strings in JSON input formats", 0) \
|
||||
M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \
|
||||
M(Bool, input_format_json_read_arrays_as_strings, true, "Allow to parse JSON arrays as strings in JSON input formats", 0) \
|
||||
M(Bool, input_format_json_try_infer_named_tuples_from_objects, true, "Try to infer named tuples from JSON objects in JSON input formats", 0) \
|
||||
M(Bool, input_format_json_infer_incomplete_types_as_strings, true, "Use type String for keys that contains only Nulls or empty objects/arrays during schema inference in JSON input formats", 0) \
|
||||
M(Bool, input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \
|
||||
M(Bool, input_format_json_ignore_unknown_keys_in_named_tuple, true, "Ignore unknown keys in json object for named tuples", 0) \
|
||||
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
|
||||
|
@ -81,7 +81,10 @@ namespace SettingsChangesHistory
|
||||
static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
|
||||
{
|
||||
{"23.9", {{"optimize_group_by_constant_keys", false, true, "Optimize group by constant keys by default"},
|
||||
{"input_format_json_try_infer_named_tuples_from_objects", false, true, "Try to infer named Tuples from JSON objects by default"}}},
|
||||
{"input_format_json_try_infer_named_tuples_from_objects", false, true, "Try to infer named Tuples from JSON objects by default"},
|
||||
{"input_format_json_read_numbers_as_strings", false, true, "Allow to read numbers as strings in JSON formats by default"},
|
||||
{"input_format_json_read_arrays_as_strings", false, true, "Allow to read arrays as strings in JSON formats by default"},
|
||||
{"input_format_json_infer_incomplete_types_as_strings", false, true, "Allow to infer incomplete types as Strings in JSON formats by default"}}},
|
||||
{"23.8", {{"rewrite_count_distinct_if_with_count_distinct_implementation", false, true, "Rewrite countDistinctIf with count_distinct_implementation configuration"}}},
|
||||
{"23.7", {{"function_sleep_max_microseconds_per_block", 0, 3000000, "In previous versions, the maximum sleep time of 3 seconds was applied only for `sleep`, but not for `sleepEachRow` function. In the new version, we introduce this setting. If you set compatibility with the previous versions, we will disable the limit altogether."}}},
|
||||
{"23.6", {{"http_send_timeout", 180, 30, "3 minutes seems crazy long. Note that this is timeout for a single network write call, not for the whole upload operation."},
|
||||
|
@ -48,6 +48,7 @@ enum class TypeIndex
|
||||
Object,
|
||||
IPv4,
|
||||
IPv6,
|
||||
JSONPaths,
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -328,6 +328,13 @@ void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & ist
|
||||
ReadBufferFromString buf(field);
|
||||
read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); });
|
||||
}
|
||||
else if (settings.json.read_arrays_as_strings && !istr.eof() && *istr.position() == '[')
|
||||
{
|
||||
String field;
|
||||
readJSONArrayInto(field, istr);
|
||||
ReadBufferFromString buf(field);
|
||||
read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); });
|
||||
}
|
||||
else if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"')
|
||||
{
|
||||
String field;
|
||||
|
@ -450,11 +450,15 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
|
||||
break;
|
||||
case FormatSettings::EscapingRule::JSON:
|
||||
result += fmt::format(
|
||||
", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_objects_as_strings={}, read_numbers_as_strings={}, try_infer_objects={}",
|
||||
", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_objects_as_strings={}, read_numbers_as_strings={}, "
|
||||
"read_arrays_as_strings={}, try_infer_objects_as_tuples={}, infer_incomplete_types_as_strings={}, try_infer_objects={}",
|
||||
settings.json.try_infer_numbers_from_strings,
|
||||
settings.json.read_bools_as_numbers,
|
||||
settings.json.read_objects_as_strings,
|
||||
settings.json.read_numbers_as_strings,
|
||||
settings.json.read_arrays_as_strings,
|
||||
settings.json.try_infer_objects_as_tuples,
|
||||
settings.json.infer_incomplete_types_as_strings,
|
||||
settings.json.allow_object_type);
|
||||
break;
|
||||
default:
|
||||
|
@ -111,7 +111,9 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.json.read_bools_as_numbers = settings.input_format_json_read_bools_as_numbers;
|
||||
format_settings.json.read_numbers_as_strings = settings.input_format_json_read_numbers_as_strings;
|
||||
format_settings.json.read_objects_as_strings = settings.input_format_json_read_objects_as_strings;
|
||||
format_settings.json.read_arrays_as_strings = settings.input_format_json_read_arrays_as_strings;
|
||||
format_settings.json.try_infer_numbers_from_strings = settings.input_format_json_try_infer_numbers_from_strings;
|
||||
format_settings.json.infer_incomplete_types_as_strings = settings.input_format_json_infer_incomplete_types_as_strings;
|
||||
format_settings.json.validate_types_from_metadata = settings.input_format_json_validate_types_from_metadata;
|
||||
format_settings.json.validate_utf8 = settings.output_format_json_validate_utf8;
|
||||
format_settings.json_object_each_row.column_for_object_name = settings.format_json_object_each_row_column_for_object_name;
|
||||
|
@ -194,12 +194,15 @@ struct FormatSettings
|
||||
bool read_bools_as_numbers = true;
|
||||
bool read_numbers_as_strings = true;
|
||||
bool read_objects_as_strings = true;
|
||||
bool read_arrays_as_strings = true;
|
||||
bool try_infer_numbers_from_strings = false;
|
||||
bool validate_types_from_metadata = true;
|
||||
bool validate_utf8 = false;
|
||||
bool allow_object_type = false;
|
||||
bool compact_allow_variable_number_of_columns = false;
|
||||
bool try_infer_objects_as_tuples = false;
|
||||
bool infer_incomplete_types_as_strings = true;
|
||||
|
||||
} json;
|
||||
|
||||
struct
|
||||
|
@ -20,6 +20,7 @@
|
||||
|
||||
#include <Core/Block.h>
|
||||
#include <Common/assert_cast.h>
|
||||
#include <Common/SipHash.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -27,10 +28,201 @@ namespace DB
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int TOO_DEEP_RECURSION;
|
||||
extern const int NOT_IMPLEMENTED;
|
||||
extern const int TYPE_MISMATCH;
|
||||
extern const int INCORRECT_DATA;
|
||||
extern const int ONLY_NULLS_WHILE_READING_SCHEMA;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
/// Special data type that represents JSON object as a set of paths and their types.
|
||||
/// It supports merging two JSON objects and creating Named Tuple from itself.
|
||||
/// It's used only for schema inference of Named Tuples from JSON objects.
|
||||
/// Example:
|
||||
/// JSON objects:
|
||||
/// "obj1" : {"a" : {"b" : 1, "c" : {"d" : 'Hello'}}, "e" : "World"}
|
||||
/// "obj2" : {"a" : {"b" : 2, "f" : [1,2,3]}, "g" : {"h" : 42}}
|
||||
/// JSONPaths type for each object:
|
||||
/// obj1 : {'a.b' : Int64, 'a.c.d' : String, 'e' : String}
|
||||
/// obj2 : {'a.b' : Int64, 'a.f' : Array(Int64), 'g.h' : Int64}
|
||||
/// Merged JSONPaths type for obj1 and obj2:
|
||||
/// obj1 ⋃ obj2 : {'a.b' : Int64, 'a.c.d' : String, 'a.f' : Array(Int64), 'e' : String, 'g.h' : Int64}
|
||||
/// Result Named Tuple:
|
||||
/// Tuple(a Tuple(b Int64, c Tuple(d String), f Array(Int64)), e String, g Tuple(h Int64))
|
||||
class DataTypeJSONPaths : public IDataTypeDummy
|
||||
{
|
||||
public:
|
||||
|
||||
/// We create DataTypeJSONPaths on each row in input data, to
|
||||
/// compare and merge such types faster, we use hash map to
|
||||
/// store mapping path -> data_type. Path is a vector
|
||||
/// of path components, to use hash map we need a hash
|
||||
/// for std::vector<String>. We cannot just concatenate
|
||||
/// components with '.' and store it as a string,
|
||||
/// because components can also contain '.'
|
||||
struct PathHash
|
||||
{
|
||||
size_t operator()(const std::vector<String> & path) const
|
||||
{
|
||||
SipHash hash;
|
||||
hash.update(path.size());
|
||||
for (const auto & part : path)
|
||||
hash.update(part);
|
||||
return hash.get64();
|
||||
}
|
||||
};
|
||||
|
||||
using Paths = std::unordered_map<std::vector<String>, DataTypePtr, PathHash>;
|
||||
|
||||
DataTypeJSONPaths(Paths paths_) : paths(std::move(paths_))
|
||||
{
|
||||
}
|
||||
|
||||
DataTypeJSONPaths() = default;
|
||||
|
||||
const char * getFamilyName() const override { return "JSONPaths"; }
|
||||
String doGetName() const override { return finalize()->getName(); }
|
||||
TypeIndex getTypeId() const override { return TypeIndex::JSONPaths; }
|
||||
|
||||
String getSQLCompatibleName() const override
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getSQLCompatibleName is not implemented for JSONObjectForInference type");
|
||||
}
|
||||
|
||||
bool isParametric() const override
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method isParametric is not implemented for JSONObjectForInference type");
|
||||
}
|
||||
|
||||
bool equals(const IDataType & rhs) const override
|
||||
{
|
||||
if (this == &rhs)
|
||||
return true;
|
||||
|
||||
if (rhs.getTypeId() != getTypeId())
|
||||
return false;
|
||||
|
||||
const auto & rhs_paths = assert_cast<const DataTypeJSONPaths &>(rhs).paths;
|
||||
if (paths.size() != rhs_paths.size())
|
||||
return false;
|
||||
|
||||
for (const auto & [path, type] : paths)
|
||||
{
|
||||
auto it = rhs_paths.find(path);
|
||||
if (it == rhs_paths.end() || !it->second->equals(*type))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool merge(const DataTypeJSONPaths & rhs, std::function<void(DataTypePtr & type1, DataTypePtr & type2)> transform_types)
|
||||
{
|
||||
for (const auto & [rhs_path, rhs_type] : rhs.paths)
|
||||
{
|
||||
auto it = paths.find(rhs_path);
|
||||
if (it != paths.end())
|
||||
{
|
||||
auto & type = it->second;
|
||||
/// If types are different, try to apply provided transform function.
|
||||
if (!type->equals(*rhs_type))
|
||||
{
|
||||
auto rhs_type_copy = rhs_type;
|
||||
transform_types(type, rhs_type_copy);
|
||||
/// If types for different paths are different even after transform, we cannot merge these objects.
|
||||
if (!type->equals(*rhs_type_copy))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
paths[rhs_path] = rhs_type;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool empty() const { return paths.empty(); }
|
||||
|
||||
DataTypePtr finalize() const
|
||||
{
|
||||
if (paths.empty())
|
||||
throw Exception(ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA, "Cannot infer named Tuple from JSON object because object is empty");
|
||||
|
||||
/// Construct a path tree from list of paths and their types and convert it to named Tuple.
|
||||
/// Example:
|
||||
/// Paths : {'a.b' : Int64, 'a.c.d' : String, 'e' : String, 'f.g' : Array(Int64), 'f.h' : String}
|
||||
/// Tree:
|
||||
/// ┌─ 'c' ─ 'd' (String)
|
||||
/// ┌─ 'a' ┴─ 'b' (Int64)
|
||||
/// root ─┼─ 'e' (String)
|
||||
/// └─ 'f' ┬─ 'g' (Array(Int64))
|
||||
/// └─ 'h' (String)
|
||||
/// Result Named Tuple:
|
||||
/// Tuple('a' Tuple('b' Int64, 'c' Tuple('d' String)), 'e' String, 'f' Tuple('g' Array(Int64), 'h' String))
|
||||
PathNode root_node;
|
||||
for (const auto & [path, type] : paths)
|
||||
{
|
||||
PathNode * current_node = &root_node;
|
||||
String current_path;
|
||||
for (const auto & name : path)
|
||||
{
|
||||
current_path += (current_path.empty() ? "" : ".") + name;
|
||||
current_node = &current_node->nodes[name];
|
||||
current_node->path = current_path;
|
||||
}
|
||||
|
||||
current_node->leaf_type = type;
|
||||
}
|
||||
|
||||
return root_node.getType();
|
||||
}
|
||||
|
||||
private:
|
||||
struct PathNode
|
||||
{
|
||||
/// Use just map to have result tuple with names in lexicographic order.
|
||||
/// No strong reason for it, made for consistency.
|
||||
std::map<String, PathNode> nodes;
|
||||
DataTypePtr leaf_type;
|
||||
/// Store path to this node for better exception message in case of ambiguous paths.
|
||||
String path;
|
||||
|
||||
DataTypePtr getType() const
|
||||
{
|
||||
/// Check if we have ambiguous paths.
|
||||
/// For example:
|
||||
/// 'a.b.c' : Int32 and 'a.b' : String
|
||||
/// Also check if leaf type is Nothing, because the next situation is possible:
|
||||
/// {"a" : {"b" : null}} -> 'a.b' : Nullable(Nothing)
|
||||
/// {"a" : {"b" : {"c" : 42}}} -> 'a.b.c' : Int32
|
||||
/// And after merge we will have ambiguous paths 'a.b.c' : Int32 and 'a.b' : Nullable(Nothing),
|
||||
/// but it's a valid case and we should ignore path 'a.b'.
|
||||
if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty())
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "JSON objects have ambiguous paths: '{}' with type {} and '{}'", path, leaf_type->getName(), nodes.begin()->second.path);
|
||||
|
||||
if (nodes.empty())
|
||||
return leaf_type;
|
||||
|
||||
Names node_names;
|
||||
node_names.reserve(nodes.size());
|
||||
DataTypes node_types;
|
||||
node_types.reserve(nodes.size());
|
||||
for (const auto & [name, node] : nodes)
|
||||
{
|
||||
node_names.push_back(name);
|
||||
node_types.push_back(node.getType());
|
||||
}
|
||||
|
||||
return std::make_shared<DataTypeTuple>(std::move(node_types), std::move(node_names));
|
||||
}
|
||||
};
|
||||
|
||||
Paths paths;
|
||||
};
|
||||
|
||||
bool checkIfTypesAreEqual(const DataTypes & types)
|
||||
{
|
||||
if (types.empty())
|
||||
@ -331,57 +523,6 @@ namespace
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge all named Tuples and empty Maps (because empty JSON objects are inferred as empty Maps)
|
||||
/// to single Tuple with elements from all tuples. It's used to infer named Tuples from JSON objects.
|
||||
void mergeAllNamedTuplesAndEmptyMaps(DataTypes & data_types, TypeIndexesSet & type_indexes, const FormatSettings & settings, JSONInferenceInfo * json_info)
|
||||
{
|
||||
if (!type_indexes.contains(TypeIndex::Tuple))
|
||||
return;
|
||||
|
||||
/// Collect all names and their types from all named tuples.
|
||||
std::unordered_map<String, DataTypes> names_to_types;
|
||||
/// Try to save original order of element names.
|
||||
Names element_names;
|
||||
for (auto & type : data_types)
|
||||
{
|
||||
const auto * tuple_type = typeid_cast<const DataTypeTuple *>(type.get());
|
||||
if (tuple_type && tuple_type->haveExplicitNames())
|
||||
{
|
||||
const auto & elements = tuple_type->getElements();
|
||||
const auto & names = tuple_type->getElementNames();
|
||||
for (size_t i = 0; i != elements.size(); ++i)
|
||||
{
|
||||
if (!names_to_types.contains(names[i]))
|
||||
element_names.push_back(names[i]);
|
||||
names_to_types[names[i]].push_back(elements[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to find common type for each tuple element.
|
||||
DataTypes element_types;
|
||||
element_types.reserve(names_to_types.size());
|
||||
for (const auto & name : element_names)
|
||||
{
|
||||
auto types = names_to_types[name];
|
||||
transformInferredTypesIfNeededImpl<true>(types, settings, json_info);
|
||||
/// If some element have different types in different tuples, we can't do anything
|
||||
if (!checkIfTypesAreEqual(types))
|
||||
return;
|
||||
element_types.push_back(types.front());
|
||||
}
|
||||
|
||||
DataTypePtr result_tuple = std::make_shared<DataTypeTuple>(element_types, element_names);
|
||||
|
||||
for (auto & type : data_types)
|
||||
{
|
||||
const auto * tuple_type = typeid_cast<const DataTypeTuple *>(type.get());
|
||||
const auto * map_type = typeid_cast<const DataTypeMap *>(type.get());
|
||||
if ((tuple_type && tuple_type->haveExplicitNames()) || (map_type && isNothing(map_type->getKeyType()) && isNothing(map_type->getValueType())))
|
||||
type = result_tuple;
|
||||
}
|
||||
}
|
||||
|
||||
void transformMapsAndStringsToStrings(DataTypes & data_types, TypeIndexesSet & type_indexes)
|
||||
{
|
||||
/// Check if we have both String and Map
|
||||
@ -397,6 +538,26 @@ namespace
|
||||
type_indexes.erase(TypeIndex::Map);
|
||||
}
|
||||
|
||||
void mergeJSONPaths(DataTypes & data_types, TypeIndexesSet & type_indexes, const FormatSettings & settings, JSONInferenceInfo * json_info)
|
||||
{
|
||||
if (!type_indexes.contains(TypeIndex::JSONPaths))
|
||||
return;
|
||||
|
||||
std::shared_ptr<DataTypeJSONPaths> merged_type = std::make_shared<DataTypeJSONPaths>();
|
||||
auto transform_func = [&](DataTypePtr & type1, DataTypePtr & type2){ transformInferredJSONTypesIfNeeded(type1, type2, settings, json_info); };
|
||||
for (auto & type : data_types)
|
||||
{
|
||||
if (const auto * json_type = typeid_cast<const DataTypeJSONPaths *>(type.get()))
|
||||
merged_type->merge(*json_type, transform_func);
|
||||
}
|
||||
|
||||
for (auto & type : data_types)
|
||||
{
|
||||
if (type->getTypeId() == TypeIndex::JSONPaths)
|
||||
type = merged_type;
|
||||
}
|
||||
}
|
||||
|
||||
template <bool is_json>
|
||||
void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, JSONInferenceInfo * json_info)
|
||||
{
|
||||
@ -429,6 +590,9 @@ namespace
|
||||
/// Convert Bool to number (Int64/Float64) if needed.
|
||||
if (settings.json.read_bools_as_numbers)
|
||||
transformBoolsAndNumbersToNumbers(data_types, type_indexes);
|
||||
|
||||
if (settings.json.try_infer_objects_as_tuples)
|
||||
mergeJSONPaths(data_types, type_indexes, settings, json_info);
|
||||
};
|
||||
|
||||
auto transform_complex_types = [&](DataTypes & data_types, TypeIndexesSet & type_indexes)
|
||||
@ -449,9 +613,6 @@ namespace
|
||||
/// Convert JSON tuples and arrays to arrays if possible.
|
||||
transformJSONTuplesAndArraysToArrays(data_types, settings, type_indexes, json_info);
|
||||
|
||||
if (settings.json.try_infer_objects_as_tuples)
|
||||
mergeAllNamedTuplesAndEmptyMaps(data_types, type_indexes, settings, json_info);
|
||||
|
||||
if (settings.json.read_objects_as_strings)
|
||||
transformMapsAndStringsToStrings(data_types, type_indexes);
|
||||
};
|
||||
@ -780,6 +941,78 @@ namespace
|
||||
return std::make_shared<DataTypeString>();
|
||||
}
|
||||
|
||||
[[maybe_unused]] bool tryReadJSONObject(ReadBuffer & buf, const FormatSettings & settings, DataTypeJSONPaths::Paths & paths, const std::vector<String> & path, JSONInferenceInfo * json_info, size_t depth)
|
||||
{
|
||||
if (depth > settings.max_parser_depth)
|
||||
throw Exception(ErrorCodes::TOO_DEEP_RECURSION,
|
||||
"Maximum parse depth ({}) exceeded. Consider rising max_parser_depth setting.", settings.max_parser_depth);
|
||||
|
||||
assertChar('{', buf);
|
||||
skipWhitespaceIfAny(buf);
|
||||
bool first = true;
|
||||
while (!buf.eof() && *buf.position() != '}')
|
||||
{
|
||||
if (!first)
|
||||
{
|
||||
if (!checkChar(',', buf))
|
||||
return false;
|
||||
skipWhitespaceIfAny(buf);
|
||||
}
|
||||
else
|
||||
first = false;
|
||||
|
||||
String key;
|
||||
if (!tryReadJSONStringInto(key, buf))
|
||||
return false;
|
||||
|
||||
std::vector<String> current_path = path;
|
||||
current_path.push_back(key);
|
||||
skipWhitespaceIfAny(buf);
|
||||
if (!checkChar(':', buf))
|
||||
return false;
|
||||
|
||||
skipWhitespaceIfAny(buf);
|
||||
|
||||
if (!buf.eof() && *buf.position() == '{')
|
||||
{
|
||||
if (!tryReadJSONObject(buf, settings, paths, current_path, json_info, depth + 1))
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto value_type = tryInferDataTypeForSingleFieldImpl<true>(buf, settings, json_info, depth + 1);
|
||||
if (!value_type)
|
||||
return false;
|
||||
|
||||
paths[current_path] = value_type;
|
||||
}
|
||||
|
||||
skipWhitespaceIfAny(buf);
|
||||
}
|
||||
|
||||
/// No '}' at the end.
|
||||
if (buf.eof())
|
||||
return false;
|
||||
|
||||
assertChar('}', buf);
|
||||
skipWhitespaceIfAny(buf);
|
||||
|
||||
/// If it was empty object and it's not root object, treat it as null, so we won't
|
||||
/// lose this path if this key contains empty object in all sample data.
|
||||
/// This case will be processed in JSONPaths type during finalize.
|
||||
if (first && !path.empty())
|
||||
paths[path] = std::make_shared<DataTypeNothing>();
|
||||
return true;
|
||||
}
|
||||
|
||||
[[maybe_unused]] DataTypePtr tryInferJSONPaths(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info, size_t depth)
|
||||
{
|
||||
DataTypeJSONPaths::Paths paths;
|
||||
if (!tryReadJSONObject(buf, settings, paths, {}, json_info, depth))
|
||||
return nullptr;
|
||||
return std::make_shared<DataTypeJSONPaths>(std::move(paths));
|
||||
}
|
||||
|
||||
template <bool is_json>
|
||||
DataTypePtr tryInferMapOrObject(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info, size_t depth)
|
||||
{
|
||||
@ -862,9 +1095,6 @@ namespace
|
||||
if (settings.json.allow_object_type)
|
||||
return std::make_shared<DataTypeObject>("json", true);
|
||||
|
||||
if (settings.json.try_infer_objects_as_tuples)
|
||||
return std::make_shared<DataTypeTuple>(value_types, json_keys);
|
||||
|
||||
if (settings.json.read_objects_as_strings)
|
||||
return std::make_shared<DataTypeString>();
|
||||
|
||||
@ -915,7 +1145,15 @@ namespace
|
||||
|
||||
/// Map/Object for JSON { key1 : value1, key2 : value2, ...}
|
||||
if (*buf.position() == '{')
|
||||
{
|
||||
if constexpr (is_json)
|
||||
{
|
||||
if (!settings.json.allow_object_type && settings.json.try_infer_objects_as_tuples)
|
||||
return tryInferJSONPaths(buf, settings, json_info, depth);
|
||||
}
|
||||
|
||||
return tryInferMapOrObject<is_json>(buf, settings, json_info, depth);
|
||||
}
|
||||
|
||||
/// String
|
||||
char quote = is_json ? '"' : '\'';
|
||||
@ -957,24 +1195,58 @@ void transformInferredJSONTypesIfNeeded(
|
||||
second = std::move(types[1]);
|
||||
}
|
||||
|
||||
void transformJSONTupleToArrayIfPossible(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info)
|
||||
void transformFinalInferredJSONTypeIfNeeded(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info)
|
||||
{
|
||||
if (!data_type)
|
||||
return;
|
||||
|
||||
if (isNothing(data_type) && settings.json.infer_incomplete_types_as_strings)
|
||||
{
|
||||
data_type = std::make_shared<DataTypeString>();
|
||||
return;
|
||||
}
|
||||
|
||||
if (const auto * nullable_type = typeid_cast<const DataTypeNullable *>(data_type.get()))
|
||||
{
|
||||
auto nested_type = nullable_type->getNestedType();
|
||||
transformFinalInferredJSONTypeIfNeeded(nested_type, settings, json_info);
|
||||
data_type = std::make_shared<DataTypeNullable>(std::move(nested_type));
|
||||
return;
|
||||
}
|
||||
|
||||
if (const auto * json_paths = typeid_cast<const DataTypeJSONPaths *>(data_type.get()))
|
||||
{
|
||||
/// If all objects were empty, use type String, so these JSON objects will be read as Strings.
|
||||
if (json_paths->empty() && settings.json.infer_incomplete_types_as_strings)
|
||||
{
|
||||
data_type = std::make_shared<DataTypeString>();
|
||||
return;
|
||||
}
|
||||
|
||||
data_type = json_paths->finalize();
|
||||
transformFinalInferredJSONTypeIfNeeded(data_type, settings, json_info);
|
||||
return;
|
||||
}
|
||||
|
||||
if (const auto * array_type = typeid_cast<const DataTypeArray *>(data_type.get()))
|
||||
{
|
||||
auto nested_type = array_type->getNestedType();
|
||||
transformJSONTupleToArrayIfPossible(nested_type, settings, json_info);
|
||||
transformFinalInferredJSONTypeIfNeeded(nested_type, settings, json_info);
|
||||
data_type = std::make_shared<DataTypeArray>(nested_type);
|
||||
return;
|
||||
}
|
||||
|
||||
if (const auto * map_type = typeid_cast<const DataTypeMap *>(data_type.get()))
|
||||
{
|
||||
auto key_type = map_type->getKeyType();
|
||||
/// If all inferred Maps are empty, use type String, so these JSON objects will be read as Strings.
|
||||
if (isNothing(key_type) && settings.json.infer_incomplete_types_as_strings)
|
||||
key_type = std::make_shared<DataTypeString>();
|
||||
|
||||
auto value_type = map_type->getValueType();
|
||||
transformJSONTupleToArrayIfPossible(value_type, settings, json_info);
|
||||
data_type = std::make_shared<DataTypeMap>(map_type->getKeyType(), value_type);
|
||||
|
||||
transformFinalInferredJSONTypeIfNeeded(value_type, settings, json_info);
|
||||
data_type = std::make_shared<DataTypeMap>(key_type, value_type);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -982,7 +1254,7 @@ void transformJSONTupleToArrayIfPossible(DataTypePtr & data_type, const FormatSe
|
||||
{
|
||||
auto nested_types = tuple_type->getElements();
|
||||
for (auto & nested_type : nested_types)
|
||||
transformJSONTupleToArrayIfPossible(nested_type, settings, json_info);
|
||||
transformFinalInferredJSONTypeIfNeeded(nested_type, settings, json_info);
|
||||
|
||||
if (tuple_type->haveExplicitNames())
|
||||
{
|
||||
|
@ -69,11 +69,14 @@ void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, c
|
||||
/// we will convert both types to Object('JSON').
|
||||
void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, JSONInferenceInfo * json_info);
|
||||
|
||||
/// Check if type is Tuple(...), try to transform nested types to find a common type for them and if all nested types
|
||||
/// are the same after transform, we convert this tuple to an Array with common nested type.
|
||||
/// For example, if we have Tuple(String, Nullable(Nothing)) we will convert it to Array(String).
|
||||
/// It's used when all rows were read and we have Tuple in the result type that can be actually an Array.
|
||||
void transformJSONTupleToArrayIfPossible(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info);
|
||||
/// Make final transform for types inferred in JSON format. It does 23 types of transformation:
|
||||
/// 1) Checks if type is Tuple(...), tries to transform nested types to find a common type for them and if all nested types
|
||||
/// are the same after transform, it converts this tuple to an Array with common nested type.
|
||||
/// For example, if we have Tuple(String, Nullable(Nothing)) we will convert it to Array(String).
|
||||
/// It's used when all rows were read and we have Tuple in the result type that can be actually an Array.
|
||||
/// 2) Finalizes all DataTypeJSONPaths to named Tuple.
|
||||
/// 3) Converts all Nothing types to String types if input_format_json_infer_incomplete_types_as_strings is enabled.
|
||||
void transformFinalInferredJSONTypeIfNeeded(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info);
|
||||
|
||||
/// Make type Nullable recursively:
|
||||
/// - Type -> Nullable(type)
|
||||
|
@ -989,8 +989,8 @@ template void readJSONStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
|
||||
template void readJSONStringInto<String>(String & s, ReadBuffer & buf);
|
||||
template bool readJSONStringInto<String, bool>(String & s, ReadBuffer & buf);
|
||||
|
||||
template <typename Vector, typename ReturnType>
|
||||
ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf)
|
||||
template <typename Vector, typename ReturnType, char opening_bracket, char closing_bracket>
|
||||
ReturnType readJSONObjectOrArrayPossiblyInvalid(Vector & s, ReadBuffer & buf)
|
||||
{
|
||||
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
|
||||
|
||||
@ -1001,8 +1001,8 @@ ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf)
|
||||
return ReturnType(false);
|
||||
};
|
||||
|
||||
if (buf.eof() || *buf.position() != '{')
|
||||
return error("JSON should start from opening curly bracket", ErrorCodes::INCORRECT_DATA);
|
||||
if (buf.eof() || *buf.position() != opening_bracket)
|
||||
return error("JSON object/array should start with corresponding opening bracket", ErrorCodes::INCORRECT_DATA);
|
||||
|
||||
s.push_back(*buf.position());
|
||||
++buf.position();
|
||||
@ -1012,7 +1012,7 @@ ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf)
|
||||
|
||||
while (!buf.eof())
|
||||
{
|
||||
char * next_pos = find_first_symbols<'\\', '{', '}', '"'>(buf.position(), buf.buffer().end());
|
||||
char * next_pos = find_first_symbols<'\\', opening_bracket, closing_bracket, '"'>(buf.position(), buf.buffer().end());
|
||||
appendToStringOrVector(s, buf, next_pos);
|
||||
buf.position() = next_pos;
|
||||
|
||||
@ -1035,23 +1035,34 @@ ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf)
|
||||
|
||||
if (*buf.position() == '"')
|
||||
quotes = !quotes;
|
||||
else if (!quotes) // can be only '{' or '}'
|
||||
balance += *buf.position() == '{' ? 1 : -1;
|
||||
else if (!quotes) // can be only opening_bracket or closing_bracket
|
||||
balance += *buf.position() == opening_bracket ? 1 : -1;
|
||||
|
||||
++buf.position();
|
||||
|
||||
if (balance == 0)
|
||||
return ReturnType(true);
|
||||
|
||||
if (balance < 0)
|
||||
if (balance < 0)
|
||||
break;
|
||||
}
|
||||
|
||||
return error("JSON should have equal number of opening and closing brackets", ErrorCodes::INCORRECT_DATA);
|
||||
return error("JSON object/array should have equal number of opening and closing brackets", ErrorCodes::INCORRECT_DATA);
|
||||
}
|
||||
|
||||
template <typename Vector, typename ReturnType>
|
||||
ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf)
|
||||
{
|
||||
return readJSONObjectOrArrayPossiblyInvalid<Vector, ReturnType, '{', '}'>(s, buf);
|
||||
}
|
||||
|
||||
template void readJSONObjectPossiblyInvalid<String>(String & s, ReadBuffer & buf);
|
||||
|
||||
void readJSONArrayInto(String & s, ReadBuffer & buf)
|
||||
{
|
||||
readJSONObjectOrArrayPossiblyInvalid<String, void, '[', ']'>(s, buf);
|
||||
}
|
||||
|
||||
template <typename ReturnType>
|
||||
ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf)
|
||||
{
|
||||
|
@ -637,6 +637,8 @@ bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf);
|
||||
template <typename Vector, typename ReturnType = void>
|
||||
ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf);
|
||||
|
||||
void readJSONArrayInto(String & s, ReadBuffer & buf);
|
||||
|
||||
template <typename Vector>
|
||||
void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf);
|
||||
|
||||
|
@ -294,7 +294,7 @@ NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema()
|
||||
/// Don't check/change types from hints.
|
||||
if (!hints.contains(name))
|
||||
{
|
||||
transformJSONTupleToArrayIfPossible(type, format_settings, &inference_info);
|
||||
transformFinalInferredJSONTypeIfNeeded(type, format_settings, &inference_info);
|
||||
/// Check that we could determine the type of this column.
|
||||
checkFinalInferredType(type, name, format_settings, nullptr, format_settings.max_rows_to_read_for_schema_inference, hints_parsing_error);
|
||||
}
|
||||
|
@ -230,7 +230,7 @@ void JSONCompactEachRowRowSchemaReader::transformTypesIfNeeded(DataTypePtr & typ
|
||||
|
||||
void JSONCompactEachRowRowSchemaReader::transformFinalTypeIfNeeded(DataTypePtr & type)
|
||||
{
|
||||
transformJSONTupleToArrayIfPossible(type, format_settings, &inference_info);
|
||||
transformFinalInferredJSONTypeIfNeeded(type, format_settings, &inference_info);
|
||||
}
|
||||
|
||||
void registerInputFormatJSONCompactEachRow(FormatFactory & factory)
|
||||
|
@ -367,7 +367,7 @@ void JSONEachRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTyp
|
||||
|
||||
void JSONEachRowSchemaReader::transformFinalTypeIfNeeded(DataTypePtr & type)
|
||||
{
|
||||
transformJSONTupleToArrayIfPossible(type, format_settings, &inference_info);
|
||||
transformFinalInferredJSONTypeIfNeeded(type, format_settings, &inference_info);
|
||||
}
|
||||
|
||||
void registerInputFormatJSONEachRow(FormatFactory & factory)
|
||||
|
@ -109,7 +109,7 @@ void JSONObjectEachRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, D
|
||||
|
||||
void JSONObjectEachRowSchemaReader::transformFinalTypeIfNeeded(DataTypePtr & type)
|
||||
{
|
||||
transformJSONTupleToArrayIfPossible(type, format_settings, &inference_info);
|
||||
transformFinalInferredJSONTypeIfNeeded(type, format_settings, &inference_info);
|
||||
}
|
||||
|
||||
void registerInputFormatJSONObjectEachRow(FormatFactory & factory)
|
||||
|
@ -5,7 +5,7 @@ insert into table function hdfs('hdfs://localhost:12222/test_02536.jsonl', 'TSV'
|
||||
set input_format_json_try_infer_named_tuples_from_objects=0;
|
||||
drop table if exists test;
|
||||
create table test (x Tuple(a UInt32, b UInt32)) engine=Memory();
|
||||
insert into test select * from hdfsCluster('test_cluster_two_shards_localhost', 'hdfs://localhost:12222/test_02536.jsonl') settings use_structure_from_insertion_table_in_table_functions=0; -- {serverError TYPE_MISMATCH}
|
||||
insert into test select * from hdfsCluster('test_cluster_two_shards_localhost', 'hdfs://localhost:12222/test_02536.jsonl') settings use_structure_from_insertion_table_in_table_functions=0; -- {serverError ILLEGAL_COLUMN}
|
||||
insert into test select * from hdfsCluster('test_cluster_two_shards_localhost', 'hdfs://localhost:12222/test_02536.jsonl') settings use_structure_from_insertion_table_in_table_functions=1;
|
||||
select * from test;
|
||||
drop table test;
|
||||
|
@ -19,3 +19,16 @@ obj Tuple(nested_obj Tuple(a Nullable(Int64), b Nullable(String), c Array(Nullab
|
||||
((42,'Hello',[1,2,3],NULL))
|
||||
((43,'World',[],'2020-01-01'))
|
||||
((NULL,NULL,[],NULL))
|
||||
obj Tuple(a Tuple(b Nullable(Int64)), `a.b` Nullable(Int64), `a.b.c` Nullable(String))
|
||||
((1),NULL,NULL)
|
||||
((NULL),2,'Hello')
|
||||
obj Tuple(a Tuple(b Tuple(c Nullable(Int64))))
|
||||
(((NULL)))
|
||||
(((10)))
|
||||
obj Tuple(a Nullable(String))
|
||||
('{}')
|
||||
obj Nullable(String)
|
||||
{}
|
||||
obj Tuple(a Array(Tuple(b Array(Nullable(Int64)), c Tuple(d Nullable(Int64)), e Nullable(String))))
|
||||
([([],(NULL),NULL),([],(NULL),NULL),([],(10),NULL)])
|
||||
([([1,2,3],(NULL),'Hello')])
|
||||
|
@ -11,4 +11,14 @@ desc format(JSONEachRow, '{"obj" : [{"a" : 42, "b" : "Hello", "c" : [1,2,3]}, {"
|
||||
select * from format(JSONEachRow, '{"obj" : [{"a" : 42, "b" : "Hello", "c" : [1,2,3]}, {"a" : 43, "b" : "World", "d" : "2020-01-01"}]}, {"obj" : [{}]}');
|
||||
desc format(JSONEachRow, '{"obj" : {"nested_obj" : {"a" : 42, "b" : "Hello", "c" : [1,2,3]}}}, {"obj" : {"nested_obj" : {"a" : 43, "b" : "World", "d" : "2020-01-01"}}}, {"obj" : {"nested_obj" : {}}}');
|
||||
select * from format(JSONEachRow, '{"obj" : {"nested_obj" : {"a" : 42, "b" : "Hello", "c" : [1,2,3]}}}, {"obj" : {"nested_obj" : {"a" : 43, "b" : "World", "d" : "2020-01-01"}}}, {"obj" : {"nested_obj" : {}}}');
|
||||
desc format(JSONEachRow, '{"obj" : {"a" : {"b" : 1}}}, {"obj" : {"a.b" : 2, "a.b.c" : "Hello"}}');
|
||||
select * from format(JSONEachRow, '{"obj" : {"a" : {"b" : 1}}}, {"obj" : {"a.b" : 2, "a.b.c" : "Hello"}}');
|
||||
desc format(JSONEachRow, '{"obj" : {"a" : {}}}, {"obj" : {"a" : {"b" : {"c" : 10}}}}');
|
||||
select * from format(JSONEachRow, '{"obj" : {"a" : {}}}, {"obj" : {"a" : {"b" : {"c" : 10}}}}');
|
||||
desc format(JSONEachRow, '{"obj" : {"a" : {}}}');
|
||||
select * from format(JSONEachRow, '{"obj" : {"a" : {}}}');
|
||||
desc format(JSONEachRow, '{"obj" : {}}');
|
||||
select * from format(JSONEachRow, '{"obj" : {}}');
|
||||
desc format(JSONEachRow, '{"obj" : {"a" : [{}, {"b" : null}, {"c" : {"d" : 10}}]}}, {"obj" : {"a" : [{"e" : "Hello", "b" : [1,2,3]}]}}');
|
||||
select * from format(JSONEachRow, '{"obj" : {"a" : [{}, {"b" : null}, {"c" : {"d" : 10}}]}}, {"obj" : {"a" : [{"e" : "Hello", "b" : [1,2,3]}]}}');
|
||||
|
||||
|
@ -0,0 +1 @@
|
||||
[1, "Hello", [1,2,3]]
|
2
tests/queries/0_stateless/02875_json_array_as_string.sql
Normal file
2
tests/queries/0_stateless/02875_json_array_as_string.sql
Normal file
@ -0,0 +1,2 @@
|
||||
set input_format_json_read_arrays_as_strings = 1;
|
||||
select * from format(JSONEachRow, 'arr String', '{"arr" : [1, "Hello", [1,2,3]]}');
|
@ -0,0 +1,7 @@
|
||||
a Nullable(String)
|
||||
b Nullable(String)
|
||||
c Array(Nullable(String))
|
||||
\N {} []
|
||||
a Tuple(b Nullable(String), c Array(Array(Nullable(String))))
|
||||
d Tuple(e Array(Nullable(String)), f Nullable(String))
|
||||
(NULL,[[],[]]) (['{}','{}'],NULL)
|
@ -0,0 +1,6 @@
|
||||
set input_format_json_infer_incomplete_types_as_strings=1;
|
||||
desc format(JSONEachRow, '{"a" : null, "b" : {}, "c" : []}');
|
||||
select * from format(JSONEachRow, '{"a" : null, "b" : {}, "c" : []}');
|
||||
desc format(JSONEachRow, '{"a" : {"b" : null, "c" : [[], []]}, "d" : {"e" : [{}, {}], "f" : null}}');
|
||||
select * from format(JSONEachRow, '{"a" : {"b" : null, "c" : [[], []]}, "d" : {"e" : [{}, {}], "f" : null}}');
|
||||
|
Loading…
Reference in New Issue
Block a user