Allow to infer numbers from strings in CSV format

This commit is contained in:
avogar 2023-11-16 13:25:31 +00:00
parent 7bd6b42af2
commit c3a76fcc08
6 changed files with 15 additions and 2 deletions

View File

@ -942,6 +942,7 @@ class IColumn;
M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \
M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, "The maximum bytes of data to read for automatic schema inference", 0) \
M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
M(Bool, input_format_csv_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference in CSV format", 0) \
M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \

View File

@ -303,8 +303,8 @@ DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSet
/// Try to determine the type of value inside quotes
auto type = tryInferDataTypeForSingleField(data, format_settings);
/// If we couldn't infer any type or it's a number or tuple in quotes, we determine it as a string.
if (!type || isNumber(removeNullable(type)) || isTuple(type))
/// If we couldn't infer any type, or it's a tuple in quotes, or it's a number and csv.try_infer_numbers_from_strings = 0, we determine it as a string.
if (!type || isTuple(type) || (isNumber(type) && !format_settings.csv.try_infer_numbers_from_strings))
return std::make_shared<DataTypeString>();
return type;

View File

@ -74,6 +74,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter;
format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns;
format_settings.csv.use_default_on_bad_values = settings.input_format_csv_use_default_on_bad_values;
format_settings.csv.try_infer_numbers_from_strings = settings.input_format_csv_try_infer_numbers_from_strings;
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;

View File

@ -164,6 +164,7 @@ struct FormatSettings
bool allow_whitespace_or_tab_as_delimiter = false;
bool allow_variable_number_of_columns = false;
bool use_default_on_bad_values = false;
/// Populated from the `input_format_csv_try_infer_numbers_from_strings` setting.
/// Must default to false, like the setting itself, so that a default-constructed
/// FormatSettings keeps inferring quoted numbers in CSV as String.
bool try_infer_numbers_from_strings = false;
} csv;
struct HiveText

View File

@ -0,0 +1,6 @@
c1 Nullable(Int64)
c2 Nullable(Float64)
c3 Nullable(Bool)
c1 Nullable(String)
c2 Nullable(String)
c3 Nullable(String)

View File

@ -0,0 +1,4 @@
set input_format_csv_try_infer_numbers_from_strings=1;
desc format(CSV, '"42","42.42","True"');
desc format(CSV, '"42","42.42","True"\n"abc","def","ghk"');