From e2d7c543eb00195ae7ae6bcfb858d165e8bcbe8e Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 28 Sep 2023 13:45:44 +0000 Subject: [PATCH] Don't try to infer numbers from strings in JSON formats by default to avoid parsing errors, add docs for setting input_format_json_try_infer_numbers_from_strings --- docs/en/interfaces/formats.md | 1 + docs/en/interfaces/schema-inference.md | 2 +- docs/en/operations/settings/settings-formats.md | 7 +++++++ src/Core/Settings.h | 2 +- src/Core/SettingsChangesHistory.h | 1 + 5 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 4591461f773..e98f19b2a65 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1263,6 +1263,7 @@ SELECT * FROM json_each_row_nested - [input_format_json_read_arrays_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_arrays_as_strings) - allow to parse JSON arrays as strings in JSON input formats. Default value - `true`. - [input_format_json_read_objects_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_objects_as_strings) - allow to parse JSON objects as strings in JSON input formats. Default value - `true`. - [input_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings-formats.md/#input_format_json_named_tuples_as_objects) - parse named tuple columns as JSON objects. Default value - `true`. +- [input_format_json_try_infer_numbers_from_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_try_infer_numbers_from_strings) - Try to infer numbers from string fields while schema inference. Default value - `false`. - [input_format_json_try_infer_named_tuples_from_objects](/docs/en/operations/settings/settings-formats.md/#input_format_json_try_infer_named_tuples_from_objects) - try to infer named tuple from JSON objects during schema inference. Default value - `true`. - [input_format_json_infer_incomplete_types_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_infer_incomplete_types_as_strings) - use type String for keys that contains only Nulls or empty objects/arrays during schema inference in JSON input formats. Default value - `true`. - [input_format_json_defaults_for_missing_elements_in_named_tuple](/docs/en/operations/settings/settings-formats.md/#input_format_json_defaults_for_missing_elements_in_named_tuple) - insert default values for missing elements in JSON object while parsing named tuple. Default value - `true`. diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index 42b2c6b882b..0aadb09730a 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -497,7 +497,7 @@ most likely this column contains only Nulls or empty Arrays/Maps. Enabling this setting allows inferring numbers from string values. -This setting is enabled by default. +This setting is disabled by default. **Example:** diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index b8fbb1b8039..bb59402079e 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -383,6 +383,13 @@ Allow parsing numbers as strings in JSON input formats. Enabled by default. +## input_format_json_try_infer_numbers_from_strings {#input_format_json_try_infer_numbers_from_strings} + +If enabled, during schema inference ClickHouse will try to infer numbers from string fields. +It can be useful if JSON data contains quoted UInt64 numbers. + +Disabled by default. + ## input_format_json_read_objects_as_strings {#input_format_json_read_objects_as_strings} Allow parsing JSON objects as strings in JSON input formats. diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 10134a3a545..2df702559f1 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -923,7 +923,7 @@ class IColumn; M(String, schema_inference_hints, "", "The list of column names and types to use in schema inference for formats without column names. The format: 'column_name1 column_type1, column_name2 column_type2, ...'", 0) \ M(Bool, schema_inference_make_columns_nullable, true, "If set to true, all inferred types will be Nullable in schema inference for formats without information about nullability.", 0) \ M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \ - M(Bool, input_format_json_try_infer_numbers_from_strings, true, "Try to infer numbers from string fields while schema inference", 0) \ + M(Bool, input_format_json_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference", 0) \ M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \ M(Bool, input_format_json_read_numbers_as_strings, true, "Allow to parse numbers as strings in JSON input formats", 0) \ M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index f400ca2d2fa..38039839e1e 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -85,6 +85,7 @@ static std::map sett {"input_format_json_read_numbers_as_strings", false, true, "Allow to read numbers as strings in JSON formats by default"}, {"input_format_json_read_arrays_as_strings", false, true, "Allow to read arrays as strings in JSON formats by default"}, {"input_format_json_infer_incomplete_types_as_strings", false, true, "Allow to infer incomplete types as Strings in JSON formats by default"}, + {"input_format_json_try_infer_numbers_from_strings", true, false, "Don't infer numbers from strings in JSON formats by default to prevent possible parsing errors"}, {"http_write_exception_in_output_format", false, true, "Output valid JSON/XML on exception in HTTP streaming."}}}, {"23.8", {{"rewrite_count_distinct_if_with_count_distinct_implementation", false, true, "Rewrite countDistinctIf with count_distinct_implementation configuration"}}}, {"23.7", {{"function_sleep_max_microseconds_per_block", 0, 3000000, "In previous versions, the maximum sleep time of 3 seconds was applied only for `sleep`, but not for `sleepEachRow` function. In the new version, we introduce this setting. If you set compatibility with the previous versions, we will disable the limit altogether."}}},