bug fix csv input field type mismatch

This commit is contained in:
kevinyhzou 2023-07-03 11:40:11 +08:00
parent 57c3941fa6
commit ba57c84db3
9 changed files with 97 additions and 6 deletions

View File

@ -472,6 +472,8 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`.
- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Default value - `false`.
- [input_format_csv_allow_check_deserialize_result](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_check_deserialize_result) - Allow to check whether the CSV input field can be deserialized successfully. Default value - `false`.
- [input_format_csv_allow_set_column_default_value_if_deserialize_failed](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_set_column_default_value_if_deserialize_failed) - Allow to set the column's default value if deserialization of the CSV input field fails. Default value - `false`.
## CSVWithNames {#csvwithnames}

View File

@ -969,6 +969,35 @@ Result
a b
```
### input_format_csv_allow_check_deserialize_result {#input_format_csv_allow_check_deserialize_result}
Allow to check whether the CSV input field can be deserialized successfully.
Default value: `false`.
### input_format_csv_allow_set_column_default_value_if_deserialize_failed {#input_format_csv_allow_set_column_default_value_if_deserialize_failed}
Allow to set the column's default value if deserialization of the CSV input field fails.
Default value: `false`.
**Examples**
Query
```bash
echo 'a,b,c' > 1.txt
./clickhouse local -q "create table test_tbl (x String, y UInt32, z Date) engine=MergeTree order by x"
cat 1.txt | ./clickhouse local -q "INSERT INTO test_tbl SETTINGS input_format_csv_allow_check_deserialize_result=true, input_format_csv_allow_set_column_default_value_if_deserialize_failed=true FORMAT CSV"
./clickhouse local -q "select * from test_tbl"
```
Result
```text
a 0 1970-01-01
```
## Values format settings {#values-format-settings}
### input_format_values_interpret_expressions {#input_format_values_interpret_expressions}

View File

@ -872,6 +872,8 @@ class IColumn;
M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \
M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \
M(Bool, input_format_csv_allow_check_deserialize_result, false, "Allow to check the csv input field deserialize whether success or not.", 0) \
M(Bool, input_format_csv_allow_set_column_default_value_if_deserialize_failed, false, "All to set column default value if the input field deserialize failed.", 0) \
M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \
M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \
M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \

View File

@ -73,6 +73,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces;
format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter;
format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns;
format_settings.csv.allow_check_deserialize = settings.input_format_csv_allow_check_deserialize;
format_settings.csv.allow_set_default_if_deserialize_failed = settings.input_format_csv_allow_set_default_if_deserialize_failed;
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;

View File

@ -141,6 +141,8 @@ struct FormatSettings
bool trim_whitespaces = true;
bool allow_whitespace_or_tab_as_delimiter = false;
bool allow_variable_number_of_columns = false;
bool allow_check_deserialize = false;
bool allow_set_default_if_deserialize_failed=false;
} csv;
struct HiveText

View File

@ -11,6 +11,7 @@
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h>
#include <Common/logger_useful.h>
namespace DB
@ -315,14 +316,48 @@ bool CSVFormatReader::readField(
return false;
}
if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
BufferBase::Position pos_start = buf->position();
size_t col_size = column.size();
try
{
/// If value is null but type is not nullable then use default value instead.
return SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization);
if (format_settings.csv.allow_check_deserialize_result)
{
std::string field;
readCSVField(field, *buf, format_settings.csv);
ReadBufferFromMemory tmp(field);
if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
SerializationNullable::deserializeTextCSVImpl(column, tmp, format_settings, serialization);
else
serialization->deserializeTextCSV(column, tmp, format_settings);
if (column.size() == col_size + 1 && field.size() > 0 && tmp.position() != tmp.buffer().end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Text CSV deserialize field bytes logical error.");
}
else
{
if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
{
/// If value is null but type is not nullable then use default value instead.
return SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization);
}
/// Read the column normally.
serialization->deserializeTextCSV(column, *buf, format_settings);
}
}
catch (Exception & e)
{
LOG_DEBUG(&Poco::Logger::get("CSVRowInputFormat"), "Failed to deserialize CSV column, exception message:{}", e.what());
if (format_settings.csv.allow_set_column_default_value_if_deserialize_failed)
{
// Reset the column and buffer position, then skip the field and set column default value.
if (column.size() == col_size + 1)
column.popBack(1);
buf->position() = pos_start;
skipField();
column.insertDefault();
}
else
throw;
}
/// Read the column normally.
serialization->deserializeTextCSV(column, *buf, format_settings);
return true;
}

View File

@ -0,0 +1,3 @@
a 1 2023-03-14
a 0 1970-01-01
c 1 1970-01-01

View File

@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Inserts a CSV file whose fields do not all match the column types of test_tbl
# (String, UInt32, Date) with both
#   input_format_csv_allow_check_deserialize_result and
#   input_format_csv_allow_set_column_default_value_if_deserialize_failed
# enabled, then prints the table contents for comparison with the reference output.
# NOTE: this sh wrapper is required because of shell_config
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl"
$CLICKHOUSE_CLIENT -q "create table test_tbl (x String, y UInt32, z Date) engine=MergeTree order by x"
# Quote CURDIR so a checkout path containing spaces or glob characters is not word-split.
cat "$CURDIR"/data_csv/csv_with_diff_field_types.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tbl SETTINGS input_format_csv_allow_check_deserialize_result=true, input_format_csv_allow_set_column_default_value_if_deserialize_failed=true FORMAT CSV"
$CLICKHOUSE_CLIENT -q "select * from test_tbl"
$CLICKHOUSE_CLIENT -q "drop table test_tbl"

View File

@ -0,0 +1,3 @@
a,1,2023-03-14
a,b,c
c,1,a
1 a 1 2023-03-14
2 a b c
3 c 1 a