Merge pull request #58561 from Avogar/json-bool-as-string

Allow to read Bool values into String in JSON input formats
This commit is contained in:
Alexey Milovidov 2024-01-06 21:41:36 +01:00 committed by GitHub
commit a899f0a9ed
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 97 additions and 4 deletions

View File

@ -1262,6 +1262,7 @@ SELECT * FROM json_each_row_nested
- [input_format_import_nested_json](/docs/en/operations/settings/settings-formats.md/#input_format_import_nested_json) - map nested JSON data to nested tables (it works for JSONEachRow format). Default value - `false`.
- [input_format_json_read_bools_as_numbers](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_bools_as_numbers) - allow parsing bools as numbers in JSON input formats. Default value - `true`.
- [input_format_json_read_bools_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_bools_as_strings) - allow parsing bools as strings in JSON input formats. Default value - `true`.
- [input_format_json_read_numbers_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_numbers_as_strings) - allow parsing numbers as strings in JSON input formats. Default value - `true`.
- [input_format_json_read_arrays_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_arrays_as_strings) - allow parsing JSON arrays as strings in JSON input formats. Default value - `true`.
- [input_format_json_read_objects_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_objects_as_strings) - allow parsing JSON objects as strings in JSON input formats. Default value - `true`.

View File

@ -614,6 +614,26 @@ DESC format(JSONEachRow, $$
└───────┴─────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
##### input_format_json_read_bools_as_strings
Enabling this setting allows reading Bool values as strings.
This setting is enabled by default.
**Example:**
```sql
SET input_format_json_read_bools_as_strings = 1;
DESC format(JSONEachRow, $$
{"value" : true}
{"value" : "Hello, World"}
$$)
```
```response
┌─name──┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ value │ Nullable(String) │ │ │ │ │ │
└───────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
##### input_format_json_read_arrays_as_strings
Enabling this setting allows reading JSON array values as strings.

View File

@ -377,6 +377,12 @@ Allow parsing bools as numbers in JSON input formats.
Enabled by default.
## input_format_json_read_bools_as_strings {#input_format_json_read_bools_as_strings}
Allow parsing bools as strings in JSON input formats.
Enabled by default.
## input_format_json_read_numbers_as_strings {#input_format_json_read_numbers_as_strings}
Allow parsing numbers as strings in JSON input formats.

View File

@ -983,6 +983,7 @@ class IColumn;
M(SchemaInferenceMode, schema_inference_mode, "default", "Mode of schema inference. 'default' - assume that all files have the same schema and schema can be inferred from any file, 'union' - files can have different schemas and the resulting schema should be the a union of schemas of all files", 0) \
M(Bool, schema_inference_make_columns_nullable, true, "If set to true, all inferred types will be Nullable in schema inference for formats without information about nullability.", 0) \
M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
M(Bool, input_format_json_read_bools_as_strings, true, "Allow to parse bools as strings in JSON input formats", 0) \
M(Bool, input_format_json_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference", 0) \
M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \
M(Bool, input_format_json_read_numbers_as_strings, true, "Allow to parse numbers as strings in JSON input formats", 0) \

View File

@ -81,6 +81,7 @@ namespace SettingsChangesHistory
/// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
{
{"24.1", {{"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}}},
{"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."},
{"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"},
{"input_format_orc_allow_missing_columns", false, true, "Allow missing columns in ORC files by default"},

View File

@ -335,6 +335,22 @@ void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & ist
{
read(column, [&](ColumnString::Chars & data) { readJSONArrayInto(data, istr); });
}
else if (settings.json.read_bools_as_strings && !istr.eof() && (*istr.position() == 't' || *istr.position() == 'f'))
{
String str_value;
if (*istr.position() == 't')
{
assertString("true", istr);
str_value = "true";
}
else if (*istr.position() == 'f')
{
assertString("false", istr);
str_value = "false";
}
read(column, [&](ColumnString::Chars & data) { data.insert(str_value.begin(), str_value.end()); });
}
else if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"')
{
String field;

View File

@ -450,10 +450,11 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
break;
case FormatSettings::EscapingRule::JSON:
result += fmt::format(
", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_objects_as_strings={}, read_numbers_as_strings={}, "
", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_bools_as_strings={}, read_objects_as_strings={}, read_numbers_as_strings={}, "
"read_arrays_as_strings={}, try_infer_objects_as_tuples={}, infer_incomplete_types_as_strings={}, try_infer_objects={}",
settings.json.try_infer_numbers_from_strings,
settings.json.read_bools_as_numbers,
settings.json.read_bools_as_strings,
settings.json.read_objects_as_strings,
settings.json.read_numbers_as_strings,
settings.json.read_arrays_as_strings,

View File

@ -111,6 +111,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.json.quote_denormals = settings.output_format_json_quote_denormals;
format_settings.json.quote_decimals = settings.output_format_json_quote_decimals;
format_settings.json.read_bools_as_numbers = settings.input_format_json_read_bools_as_numbers;
format_settings.json.read_bools_as_strings = settings.input_format_json_read_bools_as_strings;
format_settings.json.read_numbers_as_strings = settings.input_format_json_read_numbers_as_strings;
format_settings.json.read_objects_as_strings = settings.input_format_json_read_objects_as_strings;
format_settings.json.read_arrays_as_strings = settings.input_format_json_read_arrays_as_strings;

View File

@ -204,6 +204,7 @@ struct FormatSettings
bool ignore_unknown_keys_in_named_tuple = false;
bool serialize_as_strings = false;
bool read_bools_as_numbers = true;
bool read_bools_as_strings = true;
bool read_numbers_as_strings = true;
bool read_objects_as_strings = true;
bool read_arrays_as_strings = true;

View File

@ -377,6 +377,22 @@ namespace
type_indexes.erase(TypeIndex::UInt8);
}
/// If the inferred types contain both Bool and String, replace every Bool type with String.
/// It's applied only when setting input_format_json_read_bools_as_strings is enabled.
void transformJSONBoolsAndStringsToString(DataTypes & data_types, TypeIndexesSet & type_indexes)
{
    const bool has_strings = type_indexes.contains(TypeIndex::String);
    const bool has_bools = type_indexes.contains(TypeIndex::UInt8);

    /// Nothing to unify unless both kinds are present.
    if (!has_strings || !has_bools)
        return;

    for (auto & data_type : data_types)
        if (isBool(data_type))
            data_type = std::make_shared<DataTypeString>();

    /// Mirror the replacement in the index set: drop the UInt8 index checked above.
    type_indexes.erase(TypeIndex::UInt8);
}
/// If we have type Nothing/Nullable(Nothing) and some other non Nothing types,
/// convert all Nothing/Nullable(Nothing) types to the first non Nothing.
/// For example, when we have [Nothing, Array(Int64)] it will convert it to [Array(Int64), Array(Int64)]
@ -628,6 +644,10 @@ namespace
if (settings.json.read_bools_as_numbers)
transformBoolsAndNumbersToNumbers(data_types, type_indexes);
/// Convert Bool to String if needed.
if (settings.json.read_bools_as_strings)
transformJSONBoolsAndStringsToString(data_types, type_indexes);
if (settings.json.try_infer_objects_as_tuples)
mergeJSONPaths(data_types, type_indexes, settings, json_info);
};

View File

@ -1382,8 +1382,12 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field)
}
else
{
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol '{}' for key '{}'",
std::string(*buf.position(), 1), name_of_field.toString());
throw Exception(
ErrorCodes::INCORRECT_DATA,
"Cannot read JSON field here: '{}'. Unexpected symbol '{}'{}",
String(buf.position(), std::min(buf.available(), size_t(10))),
std::string(1, *buf.position()),
name_of_field.empty() ? "" : " for key " + name_of_field.toString());
}
}
@ -1753,7 +1757,7 @@ void readQuotedField(String & s, ReadBuffer & buf)
/// Reads one JSON field from `buf` into `s`.
/// `s` is cleared first; the field is then parsed by skipJSONField and handed to
/// readParsedValueInto (presumably it stores the text consumed by parse_func into `s` - confirm).
/// NOTE(review): this hunk is a diff view. The first `parse_func` line (key "json_field") appears
/// to be the removed version and the second (empty key) its replacement; only one of them
/// is expected to exist in the actual file.
void readJSONField(String & s, ReadBuffer & buf)
{
s.clear();
auto parse_func = [](ReadBuffer & in) { skipJSONField(in, "json_field"); };
auto parse_func = [](ReadBuffer & in) { skipJSONField(in, ""); };
readParsedValueInto(s, buf, parse_func);
}

View File

@ -0,0 +1,12 @@
true
false
str
true
false
str
['true','false']
['false','true']
['str1','str2']
['true','false']
['false','true']
['str1','str2']

View File

@ -0,0 +1,9 @@
set input_format_json_read_bools_as_strings=1;
select * from format(JSONEachRow, 'x String', '{"x" : true}, {"x" : false}, {"x" : "str"}');
select * from format(JSONEachRow, '{"x" : true}, {"x" : false}, {"x" : "str"}');
select * from format(JSONEachRow, 'x String', '{"x" : tru}'); -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
select * from format(JSONEachRow, 'x String', '{"x" : fals}'); -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
select * from format(JSONEachRow, 'x String', '{"x" : atru}'); -- {serverError INCORRECT_DATA}
select * from format(JSONEachRow, 'x Array(String)', '{"x" : [true, false]}, {"x" : [false, true]}, {"x" : ["str1", "str2"]}');
select * from format(JSONEachRow, '{"x" : [true, false]}, {"x" : [false, true]}, {"x" : ["str1", "str2"]}');