Merge pull request #56859 from Avogar/csv-infer-numbers-from-strings

Allow to infer numbers from strings in CSV format
This commit is contained in:
Kruglov Pavel 2023-12-12 17:09:02 +01:00 committed by GitHub
commit 6567fb2c08
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 44 additions and 2 deletions

View File

@ -478,6 +478,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`.
- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - allow variable number of columns in CSV format, ignore extra columns and use default values on missing columns. Default value - `false`.
- [input_format_csv_use_default_on_bad_values](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_default_on_bad_values) - Allow to set default value to column when CSV field deserialization failed on bad value. Default value - `false`.
- [input_format_csv_try_infer_numbers_from_strings](/docs/en/operations/settings/settings-formats.md/#input_format_csv_try_infer_numbers_from_strings) - Try to infer numbers from string fields while schema inference. Default value - `false`.
## CSVWithNames {#csvwithnames}

View File

@ -834,6 +834,27 @@ $$)
└──────────────┴───────────────┘
```
#### CSV settings {#csv-settings}
##### input_format_csv_try_infer_numbers_from_strings
Enabling this setting allows inferring numbers from string values.
This setting is disabled by default.
**Example:**
```sql
SET input_format_json_try_infer_numbers_from_strings = 1;
DESC format(CSV, '"42","42.42"');
```
```reponse
┌─name─┬─type──────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ c1 │ Nullable(Int64) │ │ │ │ │ │
│ c2 │ Nullable(Float64) │ │ │ │ │ │
└──────┴───────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
### TSV/TSKV {#tsv-tskv}
In TSV/TSKV formats ClickHouse extracts column value from the row according to tabular delimiters and then parses extracted value using

View File

@ -1130,6 +1130,13 @@ Result
a 0 1971-01-01
```
## input_format_csv_try_infer_numbers_from_strings {#input_format_csv_try_infer_numbers_from_strings}
If enabled, during schema inference ClickHouse will try to infer numbers from string fields.
It can be useful if CSV data contains quoted UInt64 numbers.
Disabled by default.
## Values format settings {#values-format-settings}
### input_format_values_interpret_expressions {#input_format_values_interpret_expressions}

View File

@ -942,6 +942,7 @@ class IColumn;
M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \
M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, "The maximum bytes of data to read for automatic schema inference", 0) \
M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
M(Bool, input_format_csv_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference in CSV format", 0) \
M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \

View File

@ -303,8 +303,8 @@ DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSet
/// Try to determine the type of value inside quotes
auto type = tryInferDataTypeForSingleField(data, format_settings);
/// If we couldn't infer any type or it's a number or tuple in quotes, we determine it as a string.
if (!type || isNumber(removeNullable(type)) || isTuple(type))
/// If we couldn't infer any type or it's tuple in quotes or it's a number and csv.try_infer_numbers_from_strings = 0, we determine it as a string.
if (!type || isTuple(type) || (isNumber(type) && !format_settings.csv.try_infer_numbers_from_strings))
return std::make_shared<DataTypeString>();
return type;

View File

@ -74,6 +74,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter;
format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns;
format_settings.csv.use_default_on_bad_values = settings.input_format_csv_use_default_on_bad_values;
format_settings.csv.try_infer_numbers_from_strings = settings.input_format_csv_try_infer_numbers_from_strings;
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;

View File

@ -164,6 +164,7 @@ struct FormatSettings
bool allow_whitespace_or_tab_as_delimiter = false;
bool allow_variable_number_of_columns = false;
bool use_default_on_bad_values = false;
bool try_infer_numbers_from_strings = true;
} csv;
struct HiveText

View File

@ -0,0 +1,6 @@
c1 Nullable(Int64)
c2 Nullable(Float64)
c3 Nullable(Bool)
c1 Nullable(String)
c2 Nullable(String)
c3 Nullable(String)

View File

@ -0,0 +1,4 @@
set input_format_csv_try_infer_numbers_from_strings=1;
desc format(CSV, '"42","42.42","True"');
desc format(CSV, '"42","42.42","True"\n"abc","def","ghk"');