mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-26 09:32:01 +00:00
bug fix csv input field type mismatch
This commit is contained in:
parent
57c3941fa6
commit
ba57c84db3
@ -472,6 +472,8 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
|
||||
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
|
||||
- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`.
|
||||
- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Default value - `false`.
|
||||
- [input_format_csv_allow_check_deserialize_result](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_check_deserialize_result) - Allow to check whether the CSV input field was deserialized successfully. Default value - `false`.
|
||||
- [input_format_csv_allow_set_column_default_value_if_deserialize_failed](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_set_column_default_value_if_deserialize_failed) - Allow to set the column's default value if deserialization of the CSV input field fails. Default value - `false`.
|
||||
|
||||
## CSVWithNames {#csvwithnames}
|
||||
|
||||
|
@ -969,6 +969,35 @@ Result
|
||||
a b
|
||||
```
|
||||
|
||||
### input_format_csv_allow_check_deserialize_result {#input_format_csv_allow_check_deserialize_result}
|
||||
|
||||
Allow to check whether each CSV input field was deserialized successfully, i.e. whether the whole field was consumed when it was parsed into the column type.
|
||||
|
||||
Default value: `false`.
|
||||
|
||||
### input_format_csv_allow_set_column_default_value_if_deserialize_failed {#input_format_csv_allow_set_column_default_value_if_deserialize_failed}
|
||||
|
||||
Allow to set the column's default value if deserialization of the CSV input field fails.
|
||||
|
||||
Default value: `false`.
|
||||
|
||||
**Examples**
|
||||
|
||||
Query
|
||||
|
||||
```bash
|
||||
echo 'a,b,c' > 1.txt
|
||||
./clickhouse local -q "create table test_tbl (x String, y UInt32, z Date) engine=MergeTree order by x"
|
||||
cat 1.txt | ./clickhouse local -q "INSERT INTO test_tbl SETTINGS input_format_csv_allow_check_deserialize_result=true, input_format_csv_allow_set_column_default_value_if_deserialize_failed=true FORMAT CSV"
|
||||
./clickhouse local -q "select * from test_tbl"
|
||||
```
|
||||
|
||||
Result
|
||||
|
||||
```text
|
||||
a 0 1970-01-01
|
||||
```
|
||||
|
||||
## Values format settings {#values-format-settings}
|
||||
|
||||
### input_format_values_interpret_expressions {#input_format_values_interpret_expressions}
|
||||
|
@ -872,6 +872,8 @@ class IColumn;
|
||||
M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
|
||||
M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \
|
||||
M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \
|
||||
M(Bool, input_format_csv_allow_check_deserialize_result, false, "Allow to check the csv input field deserialize whether success or not.", 0) \
|
||||
M(Bool, input_format_csv_allow_set_column_default_value_if_deserialize_failed, false, "All to set column default value if the input field deserialize failed.", 0) \
|
||||
M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \
|
||||
M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \
|
||||
M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \
|
||||
|
@ -73,6 +73,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces;
|
||||
format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter;
|
||||
format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns;
|
||||
format_settings.csv.allow_check_deserialize = settings.input_format_csv_allow_check_deserialize;
|
||||
format_settings.csv.allow_set_default_if_deserialize_failed = settings.input_format_csv_allow_set_default_if_deserialize_failed;
|
||||
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
|
||||
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
|
||||
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
|
||||
|
@ -141,6 +141,8 @@ struct FormatSettings
|
||||
bool trim_whitespaces = true;
|
||||
bool allow_whitespace_or_tab_as_delimiter = false;
|
||||
bool allow_variable_number_of_columns = false;
|
||||
bool allow_check_deserialize = false;
|
||||
bool allow_set_default_if_deserialize_failed=false;
|
||||
} csv;
|
||||
|
||||
struct HiveText
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <DataTypes/Serializations/SerializationNullable.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <Common/logger_useful.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -315,14 +316,48 @@ bool CSVFormatReader::readField(
|
||||
return false;
|
||||
}
|
||||
|
||||
if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
|
||||
BufferBase::Position pos_start = buf->position();
|
||||
size_t col_size = column.size();
|
||||
try
|
||||
{
|
||||
/// If value is null but type is not nullable then use default value instead.
|
||||
return SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization);
|
||||
if (format_settings.csv.allow_check_deserialize_result)
|
||||
{
|
||||
std::string field;
|
||||
readCSVField(field, *buf, format_settings.csv);
|
||||
ReadBufferFromMemory tmp(field);
|
||||
if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
|
||||
SerializationNullable::deserializeTextCSVImpl(column, tmp, format_settings, serialization);
|
||||
else
|
||||
serialization->deserializeTextCSV(column, tmp, format_settings);
|
||||
if (column.size() == col_size + 1 && field.size() > 0 && tmp.position() != tmp.buffer().end())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Text CSV deserialize field bytes logical error.");
|
||||
}
|
||||
else
|
||||
{
|
||||
if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
|
||||
{
|
||||
/// If value is null but type is not nullable then use default value instead.
|
||||
return SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization);
|
||||
}
|
||||
/// Read the column normally.
|
||||
serialization->deserializeTextCSV(column, *buf, format_settings);
|
||||
}
|
||||
}
|
||||
catch (Exception & e)
|
||||
{
|
||||
LOG_DEBUG(&Poco::Logger::get("CSVRowInputFormat"), "Failed to deserialize CSV column, exception message:{}", e.what());
|
||||
if (format_settings.csv.allow_set_column_default_value_if_deserialize_failed)
|
||||
{
|
||||
// Reset the column and buffer position, then skip the field and set column default value.
|
||||
if (column.size() == col_size + 1)
|
||||
column.popBack(1);
|
||||
buf->position() = pos_start;
|
||||
skipField();
|
||||
column.insertDefault();
|
||||
}
|
||||
else
|
||||
throw;
|
||||
}
|
||||
|
||||
/// Read the column normally.
|
||||
serialization->deserializeTextCSV(column, *buf, format_settings);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,3 @@
|
||||
a 1 2023-03-14
|
||||
a 0 1970-01-01
|
||||
c 1 1970-01-01
|
@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env bash

# Functional test: CSV input whose fields do not match the column types.
# With both settings enabled, a field that fails to deserialize is replaced by
# the column default instead of failing the whole INSERT.

# NOTE: this sh wrapper is required because of shell_config

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Start from a clean table so reruns are deterministic.
${CLICKHOUSE_CLIENT} -q "drop table if exists test_tbl"
${CLICKHOUSE_CLIENT} -q "create table test_tbl (x String, y UInt32, z Date) engine=MergeTree order by x"

# Feed the fixture through stdin. The path is quoted so that a checkout
# directory containing whitespace does not get word-split.
cat "$CURDIR"/data_csv/csv_with_diff_field_types.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tbl SETTINGS input_format_csv_allow_check_deserialize_result=true, input_format_csv_allow_set_column_default_value_if_deserialize_failed=true FORMAT CSV"

# Output is compared against the .reference file by the test runner.
${CLICKHOUSE_CLIENT} -q "select * from test_tbl"
${CLICKHOUSE_CLIENT} -q "drop table test_tbl"
|
@ -0,0 +1,3 @@
|
||||
a,1,2023-03-14
|
||||
a,b,c
|
||||
c,1,a
|
|
Loading…
Reference in New Issue
Block a user