Merge pull request #45231 from Avogar/json-tuples

Insert default values in case of missing tuple elements in JSONEachRow
This commit is contained in:
Kruglov Pavel 2023-01-16 17:49:50 +01:00 committed by GitHub
commit bdb3517512
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 85 additions and 11 deletions

View File

@ -1203,12 +1203,14 @@ SELECT * FROM json_each_row_nested
- [input_format_json_read_bools_as_numbers](/docs/en/operations/settings/settings.md/#input_format_json_read_bools_as_numbers) - allow to parse bools as numbers in JSON input formats. Default value - `true`.
- [input_format_json_read_numbers_as_strings](/docs/en/operations/settings/settings.md/#input_format_json_read_numbers_as_strings) - allow to parse numbers as strings in JSON input formats. Default value - `false`.
- [input_format_json_read_objects_as_strings](/docs/en/operations/settings/settings.md/#input_format_json_read_objects_as_strings) - allow to parse JSON objects as strings in JSON input formats. Default value - `false`.
- [input_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#input_format_json_named_tuples_as_objects) - parse named tuple columns as JSON objects. Default value - `true`.
- [input_format_json_defaults_for_missing_elements_in_named_tuple](/docs/en/operations/settings/settings.md/#input_format_json_defaults_for_missing_elements_in_named_tuple) - insert default values for missing elements in JSON object while parsing named tuple. Default value - `true`.
- [output_format_json_quote_64bit_integers](/docs/en/operations/settings/settings.md/#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`.
- [output_format_json_quote_64bit_floats](/docs/en/operations/settings/settings.md/#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`.
- [output_format_json_quote_denormals](/docs/en/operations/settings/settings.md/#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`.
- [output_format_json_quote_decimals](/docs/en/operations/settings/settings.md/#output_format_json_quote_decimals) - controls quoting of decimals in JSON output format. Default value - `false`.
- [output_format_json_escape_forward_slashes](/docs/en/operations/settings/settings.md/#output_format_json_escape_forward_slashes) - controls escaping forward slashes for string outputs in JSON output format. Default value - `true`.
- [output_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#output_format_json_named_tuples_as_objects) - serialize named tuple columns as JSON objects. Default value - `false`.
- [output_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#output_format_json_named_tuples_as_objects) - serialize named tuple columns as JSON objects. Default value - `true`.
- [output_format_json_array_of_rows](/docs/en/operations/settings/settings.md/#output_format_json_array_of_rows) - output a JSON array of all rows in JSONEachRow(Compact) format. Default value - `false`.
- [output_format_json_validate_utf8](/docs/en/operations/settings/settings.md/#output_format_json_validate_utf8) - enables validation of UTF-8 sequences in JSON output formats (note that it doesn't impact formats JSON/JSONCompact/JSONColumnsWithMetadata, they always validate utf8). Default value - `false`.

View File

@ -4122,7 +4122,20 @@ Enabled by default.
Serialize named tuple columns as JSON objects.
Disabled by default.
Enabled by default.
### input_format_json_named_tuples_as_objects {#input_format_json_named_tuples_as_objects}
Parse named tuple columns as JSON objects.
Enabled by default.
### input_format_json_defaults_for_missing_elements_in_named_tuple {#input_format_json_defaults_for_missing_elements_in_named_tuple}
Insert default values for missing elements in JSON object while parsing named tuple.
This setting works only when setting `input_format_json_named_tuples_as_objects` is enabled.
Enabled by default.
### output_format_json_array_of_rows {#output_format_json_array_of_rows}

View File

@ -773,6 +773,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \
M(Bool, input_format_json_read_numbers_as_strings, false, "Allow to parse numbers as strings in JSON input formats", 0) \
M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \
M(Bool, input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \

View File

@ -80,7 +80,8 @@ namespace SettingsChangesHistory
/// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
{
{"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"}}},
{"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"},
{"input_format_json_defaults_for_missing_elements_in_named_tuple", false, true, "Allow missing elements in JSON objects while reading named tuples by default"}}},
{"22.12", {{"max_size_to_preallocate_for_aggregation", 10'000'000, 100'000'000, "This optimizes performance"},
{"query_plan_aggregation_in_order", 0, 1, "Enable some refactoring around query plan"},
{"format_binary_max_string_size", 0, 1_GiB, "Prevent allocating large amount of memory"}}},

View File

@ -16,6 +16,7 @@ namespace ErrorCodes
{
extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH;
extern const int NOT_FOUND_COLUMN_IN_BLOCK;
extern const int INCORRECT_DATA;
}
@ -154,7 +155,7 @@ void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, co
void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
if (settings.json.named_tuples_as_objects
if (settings.json.write_named_tuples_as_objects
&& have_explicit_names)
{
writeChar('{', ostr);
@ -185,7 +186,7 @@ void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_nu
void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
if (settings.json.named_tuples_as_objects
if (settings.json.read_named_tuples_as_objects
&& have_explicit_names)
{
skipWhitespaceIfAny(istr);
@ -194,12 +195,15 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr
addElementSafe(elems.size(), column, [&]
{
// Require all elements but in arbitrary order.
for (size_t i = 0; i < elems.size(); ++i)
std::vector<UInt8> seen_elements(elems.size(), 0);
size_t i = 0;
while (!istr.eof() && *istr.position() != '}')
{
if (i == elems.size())
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. Expected no more than {}", elems.size());
if (i > 0)
{
skipWhitespaceIfAny(istr);
assertChar(',', istr);
skipWhitespaceIfAny(istr);
}
@ -211,12 +215,35 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr
skipWhitespaceIfAny(istr);
const size_t element_pos = getPositionByName(name);
seen_elements[element_pos] = 1;
auto & element_column = extractElementColumn(column, element_pos);
elems[element_pos]->deserializeTextJSON(element_column, istr, settings);
skipWhitespaceIfAny(istr);
++i;
}
skipWhitespaceIfAny(istr);
assertChar('}', istr);
/// Check if we have missing elements.
if (i != elems.size())
{
for (size_t element_pos = 0; element_pos != seen_elements.size(); ++element_pos)
{
if (seen_elements[element_pos])
continue;
if (!settings.json.defaults_for_missing_elements_in_named_tuple)
throw Exception(
ErrorCodes::INCORRECT_DATA,
"JSON object doesn't contain tuple element {}. If you want to insert defaults in case of missing elements, "
"enable setting input_format_json_defaults_for_missing_elements_in_named_tuple",
elems[element_pos]->getElementName());
auto & element_column = extractElementColumn(column, element_pos);
element_column.insertDefault();
}
}
});
}
else

View File

@ -90,7 +90,9 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio;
format_settings.json.array_of_rows = settings.output_format_json_array_of_rows;
format_settings.json.escape_forward_slashes = settings.output_format_json_escape_forward_slashes;
format_settings.json.named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
format_settings.json.write_named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
format_settings.json.read_named_tuples_as_objects = settings.input_format_json_named_tuples_as_objects;
format_settings.json.defaults_for_missing_elements_in_named_tuple = settings.input_format_json_defaults_for_missing_elements_in_named_tuple;
format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
format_settings.json.quote_64bit_floats = settings.output_format_json_quote_64bit_floats;
format_settings.json.quote_denormals = settings.output_format_json_quote_denormals;

View File

@ -153,7 +153,9 @@ struct FormatSettings
bool quote_denormals = true;
bool quote_decimals = false;
bool escape_forward_slashes = true;
bool named_tuples_as_objects = false;
bool read_named_tuples_as_objects = false;
bool write_named_tuples_as_objects = false;
bool defaults_for_missing_elements_in_named_tuple = false;
bool serialize_as_strings = false;
bool read_bools_as_numbers = true;
bool read_numbers_as_strings = true;

View File

@ -0,0 +1,6 @@
(1,2,NULL)
(1,2,NULL)
(NULL,NULL,NULL)
1
1
1

View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Tags: long
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
echo '{"t" : {"a" : 1, "b" : 2}}' | $CLICKHOUSE_LOCAL --input-format=NDJSON --structure='t Tuple(a Nullable(UInt32), b Nullable(UInt32), c Nullable(UInt32))' -q "select * from table"
echo '{"t" : { "a" : 1 , "b" : 2 } }' | $CLICKHOUSE_LOCAL --input-format=NDJSON --structure='t Tuple(a Nullable(UInt32), b Nullable(UInt32), c Nullable(UInt32))' -q "select * from table"
echo '{"t" : {}}' | $CLICKHOUSE_LOCAL --input-format=NDJSON --structure='t Tuple(a Nullable(UInt32), b Nullable(UInt32), c Nullable(UInt32))' -q "select * from table"
echo '{"t" : {"a" : 1, "b" : 2}}' | $CLICKHOUSE_LOCAL --input-format=NDJSON --structure='t Tuple(a Nullable(UInt32), b Nullable(UInt32), c Nullable(UInt32))' -q "select * from table" --input_format_json_defaults_for_missing_elements_in_named_tuple=0 2>&1 | grep -F "INCORRECT_DATA" -c
echo '{"t" : {"a" : 1, "d" : 2}}' | $CLICKHOUSE_LOCAL --input-format=NDJSON --structure='t Tuple(a Nullable(UInt32), b Nullable(UInt32), c Nullable(UInt32))' -q "select * from table" 2>&1 | grep -F "NOT_FOUND_COLUMN_IN_BLOCK" -c
echo '{"t" : {"a" : 1, "b" : 2, "c" : 3, "d" : 4}}' | $CLICKHOUSE_LOCAL --input-format=NDJSON --structure='t Tuple(a Nullable(UInt32), b Nullable(UInt32), c Nullable(UInt32))' -q "select * from table" 2>&1 | grep -F "INCORRECT_DATA" -c