From 87b934c4725dc69345bf4981483e2e08f70ba76d Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 12 Jan 2023 16:36:44 +0000 Subject: [PATCH] Insert default values in case of missing tuple elements in JSONEachRow --- docs/en/interfaces/formats.md | 4 +- docs/en/operations/settings/settings.md | 15 ++++++- src/Core/Settings.h | 2 + src/Core/SettingsChangesHistory.h | 3 +- .../Serializations/SerializationTuple.cpp | 39 ++++++++++++++++--- src/Formats/FormatFactory.cpp | 4 +- src/Formats/FormatSettings.h | 4 +- ...son_missing_named_tuple_elements.reference | 6 +++ ...02532_json_missing_named_tuple_elements.sh | 19 +++++++++ 9 files changed, 85 insertions(+), 11 deletions(-) create mode 100644 tests/queries/0_stateless/02532_json_missing_named_tuple_elements.reference create mode 100755 tests/queries/0_stateless/02532_json_missing_named_tuple_elements.sh diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index e28c486afca..d384ed639eb 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1203,12 +1203,14 @@ SELECT * FROM json_each_row_nested - [input_format_json_read_bools_as_numbers](/docs/en/operations/settings/settings.md/#input_format_json_read_bools_as_numbers) - allow to parse bools as numbers in JSON input formats. Default value - `true`. - [input_format_json_read_numbers_as_strings](/docs/en/operations/settings/settings.md/#input_format_json_read_numbers_as_strings) - allow to parse numbers as strings in JSON input formats. Default value - `false`. - [input_format_json_read_objects_as_strings](/docs/en/operations/settings/settings.md/#input_format_json_read_objects_as_strings) - allow to parse JSON objects as strings in JSON input formats. Default value - `false`. +- [input_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#input_format_json_named_tuples_as_objects) - parse named tuple columns as JSON objects. Default value - `true`. 
+- [input_format_json_defaults_for_missing_elements_in_named_tuple](/docs/en/operations/settings/settings.md/#input_format_json_defaults_for_missing_elements_in_named_tuple) - insert default values for elements missing from the JSON object while parsing a named tuple. Default value - `true`. - [output_format_json_quote_64bit_integers](/docs/en/operations/settings/settings.md/#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`. - [output_format_json_quote_64bit_floats](/docs/en/operations/settings/settings.md/#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`. - [output_format_json_quote_denormals](/docs/en/operations/settings/settings.md/#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`. - [output_format_json_quote_decimals](/docs/en/operations/settings/settings.md/#output_format_json_quote_decimals) - controls quoting of decimals in JSON output format. Default value - `false`. - [output_format_json_escape_forward_slashes](/docs/en/operations/settings/settings.md/#output_format_json_escape_forward_slashes) - controls escaping forward slashes for string outputs in JSON output format. Default value - `true`. -- [output_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#output_format_json_named_tuples_as_objects) - serialize named tuple columns as JSON objects. Default value - `false`. +- [output_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#output_format_json_named_tuples_as_objects) - serialize named tuple columns as JSON objects. Default value - `true`. - [output_format_json_array_of_rows](/docs/en/operations/settings/settings.md/#output_format_json_array_of_rows) - output a JSON array of all rows in JSONEachRow(Compact) format. Default value - `false`. 
- [output_format_json_validate_utf8](/docs/en/operations/settings/settings.md/#output_format_json_validate_utf8) - enables validation of UTF-8 sequences in JSON output formats (note that it doesn't impact formats JSON/JSONCompact/JSONColumnsWithMetadata, they always validate utf8). Default value - `false`. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 4ffe2bbc7c4..53cc9f1e349 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4122,7 +4122,20 @@ Enabled by default. Serialize named tuple columns as JSON objects. -Disabled by default. +Enabled by default. + +### input_format_json_named_tuples_as_objects {#input_format_json_named_tuples_as_objects} + +Parse named tuple columns as JSON objects. + +Enabled by default. + +### input_format_json_defaults_for_missing_elements_in_named_tuple {#input_format_json_defaults_for_missing_elements_in_named_tuple} + +Insert default values for elements missing from the JSON object while parsing a named tuple. +This setting takes effect only when the `input_format_json_named_tuples_as_objects` setting is enabled. + +Enabled by default. 
### output_format_json_array_of_rows {#output_format_json_array_of_rows} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index b8d46244b6c..f58bd7ebafb 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -773,6 +773,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \ M(Bool, input_format_json_read_numbers_as_strings, false, "Allow to parse numbers as strings in JSON input formats", 0) \ M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \ + M(Bool, input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \ + M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \ M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index d67f1b94d5d..534fcd42037 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -80,7 +80,8 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static std::map settings_changes_history = { - {"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested 
json objects as strings while object type is experimental"}}}, + {"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"}, + {"input_format_json_defaults_for_missing_elements_in_named_tuple", false, true, "Allow missing elements in JSON objects while reading named tuples by default"}}}, {"22.12", {{"max_size_to_preallocate_for_aggregation", 10'000'000, 100'000'000, "This optimizes performance"}, {"query_plan_aggregation_in_order", 0, 1, "Enable some refactoring around query plan"}, {"format_binary_max_string_size", 0, 1_GiB, "Prevent allocating large amount of memory"}}}, diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 0ed2b034985..50d956584b9 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -16,6 +16,7 @@ namespace ErrorCodes { extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH; extern const int NOT_FOUND_COLUMN_IN_BLOCK; + extern const int INCORRECT_DATA; } @@ -154,7 +155,7 @@ void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, co void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { - if (settings.json.named_tuples_as_objects + if (settings.json.write_named_tuples_as_objects && have_explicit_names) { writeChar('{', ostr); @@ -185,7 +186,7 @@ void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_nu void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - if (settings.json.named_tuples_as_objects + if (settings.json.read_named_tuples_as_objects && have_explicit_names) { skipWhitespaceIfAny(istr); @@ -194,12 +195,15 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr 
addElementSafe(elems.size(), column, [&] { - // Require all elements but in arbitrary order. - for (size_t i = 0; i < elems.size(); ++i) + std::vector seen_elements(elems.size(), 0); + size_t i = 0; + while (!istr.eof() && *istr.position() != '}') { + if (i == elems.size()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. Expected no more than {}", elems.size()); + if (i > 0) { - skipWhitespaceIfAny(istr); assertChar(',', istr); skipWhitespaceIfAny(istr); } @@ -211,12 +215,35 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr skipWhitespaceIfAny(istr); const size_t element_pos = getPositionByName(name); + seen_elements[element_pos] = 1; auto & element_column = extractElementColumn(column, element_pos); elems[element_pos]->deserializeTextJSON(element_column, istr, settings); + + skipWhitespaceIfAny(istr); + ++i; } - skipWhitespaceIfAny(istr); assertChar('}', istr); + + /// Check if we have missing elements. + if (i != elems.size()) + { + for (size_t element_pos = 0; element_pos != seen_elements.size(); ++element_pos) + { + if (seen_elements[element_pos]) + continue; + + if (!settings.json.defaults_for_missing_elements_in_named_tuple) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "JSON object doesn't contain tuple element {}. 
If you want to insert defaults in case of missing elements, " + "enable setting input_format_json_defaults_for_missing_elements_in_named_tuple", + elems[element_pos]->getElementName()); + + auto & element_column = extractElementColumn(column, element_pos); + element_column.insertDefault(); + } + } }); } else diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index dc2f4ffcf55..ed2464f98e8 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -90,7 +90,9 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio; format_settings.json.array_of_rows = settings.output_format_json_array_of_rows; format_settings.json.escape_forward_slashes = settings.output_format_json_escape_forward_slashes; - format_settings.json.named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects; + format_settings.json.write_named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects; + format_settings.json.read_named_tuples_as_objects = settings.input_format_json_named_tuples_as_objects; + format_settings.json.defaults_for_missing_elements_in_named_tuple = settings.input_format_json_defaults_for_missing_elements_in_named_tuple; format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers; format_settings.json.quote_64bit_floats = settings.output_format_json_quote_64bit_floats; format_settings.json.quote_denormals = settings.output_format_json_quote_denormals; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index dcdd44edfeb..9d8680a009d 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -153,7 +153,9 @@ struct FormatSettings bool quote_denormals = true; bool quote_decimals = false; bool escape_forward_slashes = true; - bool named_tuples_as_objects = false; + bool read_named_tuples_as_objects = false; + bool 
write_named_tuples_as_objects = false; + bool defaults_for_missing_elements_in_named_tuple = false; bool serialize_as_strings = false; bool read_bools_as_numbers = true; bool read_numbers_as_strings = true; diff --git a/tests/queries/0_stateless/02532_json_missing_named_tuple_elements.reference b/tests/queries/0_stateless/02532_json_missing_named_tuple_elements.reference new file mode 100644 index 00000000000..d51dab695f2 --- /dev/null +++ b/tests/queries/0_stateless/02532_json_missing_named_tuple_elements.reference @@ -0,0 +1,6 @@ +(1,2,NULL) +(1,2,NULL) +(NULL,NULL,NULL) +1 +1 +1 diff --git a/tests/queries/0_stateless/02532_json_missing_named_tuple_elements.sh b/tests/queries/0_stateless/02532_json_missing_named_tuple_elements.sh new file mode 100755 index 00000000000..97847b08203 --- /dev/null +++ b/tests/queries/0_stateless/02532_json_missing_named_tuple_elements.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +echo '{"t" : {"a" : 1, "b" : 2}}' | $CLICKHOUSE_LOCAL --input-format=NDJSON --structure='t Tuple(a Nullable(UInt32), b Nullable(UInt32), c Nullable(UInt32))' -q "select * from table" + +echo '{"t" : { "a" : 1 , "b" : 2 } }' | $CLICKHOUSE_LOCAL --input-format=NDJSON --structure='t Tuple(a Nullable(UInt32), b Nullable(UInt32), c Nullable(UInt32))' -q "select * from table" + +echo '{"t" : {}}' | $CLICKHOUSE_LOCAL --input-format=NDJSON --structure='t Tuple(a Nullable(UInt32), b Nullable(UInt32), c Nullable(UInt32))' -q "select * from table" + +echo '{"t" : {"a" : 1, "b" : 2}}' | $CLICKHOUSE_LOCAL --input-format=NDJSON --structure='t Tuple(a Nullable(UInt32), b Nullable(UInt32), c Nullable(UInt32))' -q "select * from table" --input_format_json_defaults_for_missing_elements_in_named_tuple=0 2>&1 | grep -F "INCORRECT_DATA" -c + +echo '{"t" : {"a" : 1, "d" : 2}}' | $CLICKHOUSE_LOCAL --input-format=NDJSON --structure='t Tuple(a Nullable(UInt32), b Nullable(UInt32), c Nullable(UInt32))' -q "select * from table" 2>&1 | grep -F "NOT_FOUND_COLUMN_IN_BLOCK" -c + +echo '{"t" : {"a" : 1, "b" : 2, "c" : 3, "d" : 4}}' | $CLICKHOUSE_LOCAL --input-format=NDJSON --structure='t Tuple(a Nullable(UInt32), b Nullable(UInt32), c Nullable(UInt32))' -q "select * from table" 2>&1 | grep -F "INCORRECT_DATA" -c +