mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
Merge pull request #56859 from Avogar/csv-infer-numbers-from-strings
Allow to infer numbers from strings in CSV format
This commit is contained in:
commit
6567fb2c08
@ -478,6 +478,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
|
|||||||
- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`.
|
- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`.
|
||||||
- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - allow variable number of columns in CSV format, ignore extra columns and use default values on missing columns. Default value - `false`.
|
- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - allow variable number of columns in CSV format, ignore extra columns and use default values on missing columns. Default value - `false`.
|
||||||
- [input_format_csv_use_default_on_bad_values](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_default_on_bad_values) - Allow to set default value to column when CSV field deserialization failed on bad value. Default value - `false`.
|
- [input_format_csv_use_default_on_bad_values](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_default_on_bad_values) - Allow to set default value to column when CSV field deserialization failed on bad value. Default value - `false`.
|
||||||
|
- [input_format_csv_try_infer_numbers_from_strings](/docs/en/operations/settings/settings-formats.md/#input_format_csv_try_infer_numbers_from_strings) - Try to infer numbers from string fields during schema inference. Default value - `false`.
|
||||||
|
|
||||||
## CSVWithNames {#csvwithnames}
|
## CSVWithNames {#csvwithnames}
|
||||||
|
|
||||||
|
@ -834,6 +834,27 @@ $$)
|
|||||||
└──────────────┴───────────────┘
|
└──────────────┴───────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### CSV settings {#csv-settings}
|
||||||
|
|
||||||
|
##### input_format_csv_try_infer_numbers_from_strings
|
||||||
|
|
||||||
|
Enabling this setting allows inferring numbers from string values.
|
||||||
|
|
||||||
|
This setting is disabled by default.
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SET input_format_csv_try_infer_numbers_from_strings = 1;
|
||||||
|
DESC format(CSV, '"42","42.42"');
|
||||||
|
```
|
||||||
|
```response
|
||||||
|
┌─name─┬─type──────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||||
|
│ c1 │ Nullable(Int64) │ │ │ │ │ │
|
||||||
|
│ c2 │ Nullable(Float64) │ │ │ │ │ │
|
||||||
|
└──────┴───────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
### TSV/TSKV {#tsv-tskv}
|
### TSV/TSKV {#tsv-tskv}
|
||||||
|
|
||||||
In TSV/TSKV formats ClickHouse extracts column value from the row according to tabular delimiters and then parses extracted value using
|
In TSV/TSKV formats ClickHouse extracts column value from the row according to tabular delimiters and then parses extracted value using
|
||||||
|
@ -1130,6 +1130,13 @@ Result
|
|||||||
a 0 1971-01-01
|
a 0 1971-01-01
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## input_format_csv_try_infer_numbers_from_strings {#input_format_csv_try_infer_numbers_from_strings}
|
||||||
|
|
||||||
|
If enabled, during schema inference ClickHouse will try to infer numbers from string fields.
|
||||||
|
It can be useful if CSV data contains quoted UInt64 numbers.
|
||||||
|
|
||||||
|
Disabled by default.
|
||||||
|
|
||||||
## Values format settings {#values-format-settings}
|
## Values format settings {#values-format-settings}
|
||||||
|
|
||||||
### input_format_values_interpret_expressions {#input_format_values_interpret_expressions}
|
### input_format_values_interpret_expressions {#input_format_values_interpret_expressions}
|
||||||
|
@ -942,6 +942,7 @@ class IColumn;
|
|||||||
M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \
|
M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \
|
||||||
M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, "The maximum bytes of data to read for automatic schema inference", 0) \
|
M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, "The maximum bytes of data to read for automatic schema inference", 0) \
|
||||||
M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
|
M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
|
||||||
|
M(Bool, input_format_csv_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference in CSV format", 0) \
|
||||||
M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
|
M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
|
||||||
M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
|
M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
|
||||||
M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \
|
M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \
|
||||||
|
@ -303,8 +303,8 @@ DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSet
|
|||||||
/// Try to determine the type of value inside quotes
|
/// Try to determine the type of value inside quotes
|
||||||
auto type = tryInferDataTypeForSingleField(data, format_settings);
|
auto type = tryInferDataTypeForSingleField(data, format_settings);
|
||||||
|
|
||||||
/// If we couldn't infer any type or it's a number or tuple in quotes, we determine it as a string.
|
/// If we couldn't infer any type or it's tuple in quotes or it's a number and csv.try_infer_numbers_from_strings = 0, we determine it as a string.
|
||||||
if (!type || isNumber(removeNullable(type)) || isTuple(type))
|
if (!type || isTuple(type) || (isNumber(type) && !format_settings.csv.try_infer_numbers_from_strings))
|
||||||
return std::make_shared<DataTypeString>();
|
return std::make_shared<DataTypeString>();
|
||||||
|
|
||||||
return type;
|
return type;
|
||||||
|
@ -74,6 +74,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
|||||||
format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter;
|
format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter;
|
||||||
format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns;
|
format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns;
|
||||||
format_settings.csv.use_default_on_bad_values = settings.input_format_csv_use_default_on_bad_values;
|
format_settings.csv.use_default_on_bad_values = settings.input_format_csv_use_default_on_bad_values;
|
||||||
|
format_settings.csv.try_infer_numbers_from_strings = settings.input_format_csv_try_infer_numbers_from_strings;
|
||||||
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
|
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
|
||||||
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
|
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
|
||||||
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
|
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
|
||||||
|
@ -164,6 +164,7 @@ struct FormatSettings
|
|||||||
bool allow_whitespace_or_tab_as_delimiter = false;
|
bool allow_whitespace_or_tab_as_delimiter = false;
|
||||||
bool allow_variable_number_of_columns = false;
|
bool allow_variable_number_of_columns = false;
|
||||||
bool use_default_on_bad_values = false;
|
bool use_default_on_bad_values = false;
|
||||||
|
/// Infer numeric types from quoted CSV fields during schema inference. Off by default, matching the default of the `input_format_csv_try_infer_numbers_from_strings` setting.
bool try_infer_numbers_from_strings = false;
|
||||||
} csv;
|
} csv;
|
||||||
|
|
||||||
struct HiveText
|
struct HiveText
|
||||||
|
@ -0,0 +1,6 @@
|
|||||||
|
c1 Nullable(Int64)
|
||||||
|
c2 Nullable(Float64)
|
||||||
|
c3 Nullable(Bool)
|
||||||
|
c1 Nullable(String)
|
||||||
|
c2 Nullable(String)
|
||||||
|
c3 Nullable(String)
|
@ -0,0 +1,4 @@
|
|||||||
|
set input_format_csv_try_infer_numbers_from_strings=1;
|
||||||
|
desc format(CSV, '"42","42.42","True"');
|
||||||
|
desc format(CSV, '"42","42.42","True"\n"abc","def","ghk"');
|
||||||
|
|
Loading…
Reference in New Issue
Block a user