Revert "Improve CSVInputFormat to check and set default value to column if deserialize failed"

This commit is contained in:
Kruglov Pavel 2023-07-19 14:44:59 +02:00 committed by GitHub
parent 63d0616a22
commit 7b3564f96a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 2 additions and 91 deletions

View File

@ -472,7 +472,6 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - allow whitespace or tab to be used as the field delimiter in CSV strings. Default value - `false`.
- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Default value - `false`.
- [input_format_csv_use_default_on_bad_values](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_default_on_bad_values) - use the column's default value when deserialization of a CSV field fails on a bad value. Default value - `false`.
## CSVWithNames {#csvwithnames}

View File

@ -989,28 +989,6 @@ Result
a b
```
### input_format_csv_use_default_on_bad_values {#input_format_csv_use_default_on_bad_values}
Allows the column's default value to be used when deserialization of a CSV field fails on a bad value.
Default value: `false`.
**Examples**
Query
```bash
./clickhouse local -q "create table test_tbl (x String, y UInt32, z Date) engine=MergeTree order by x"
echo 'a,b,c' | ./clickhouse local -q "INSERT INTO test_tbl SETTINGS input_format_csv_use_default_on_bad_values=true FORMAT CSV"
./clickhouse local -q "select * from test_tbl"
```
Result
```text
a 0 1970-01-01
```
## Values format settings {#values-format-settings}
### input_format_values_interpret_expressions {#input_format_values_interpret_expressions}

View File

@ -874,7 +874,6 @@ class IColumn;
M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \
M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \
M(Bool, input_format_csv_use_default_on_bad_values, false, "Allow to set default value to column when CSV field deserialization failed on bad value", 0) \
M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \
M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \
M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \

View File

@ -73,7 +73,6 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces;
format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter;
format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns;
format_settings.csv.use_default_on_bad_values = settings.input_format_csv_use_default_on_bad_values;
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;

View File

@ -152,7 +152,6 @@ struct FormatSettings
bool trim_whitespaces = true;
bool allow_whitespace_or_tab_as_delimiter = false;
bool allow_variable_number_of_columns = false;
bool use_default_on_bad_values = false;
} csv;
struct HiveText

View File

@ -1,5 +1,4 @@
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/Operators.h>
@ -316,54 +315,17 @@ bool CSVFormatReader::readField(
return false;
}
if (format_settings.csv.use_default_on_bad_values)
return readFieldOrDefault(column, type, serialization);
return readFieldImpl(*buf, column, type, serialization);
}
bool CSVFormatReader::readFieldImpl(ReadBuffer & istr, DB::IColumn & column, const DB::DataTypePtr & type, const DB::SerializationPtr & serialization)
{
if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
{
/// If value is null but type is not nullable then use default value instead.
return SerializationNullable::deserializeTextCSVImpl(column, istr, format_settings, serialization);
return SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization);
}
/// Read the column normally.
serialization->deserializeTextCSV(column, istr, format_settings);
serialization->deserializeTextCSV(column, *buf, format_settings);
return true;
}
/// Reads one CSV field and deserializes it into `column`, falling back to the column's
/// default value when the field cannot be parsed.
///
/// The raw field is first copied out of the main buffer into a temporary string and parsed
/// from there, so a failed parse never leaves the main buffer in the middle of a field.
///
/// Returns the result of readFieldImpl when the field parsed cleanly;
/// returns false when the value was bad and a default was inserted instead.
bool CSVFormatReader::readFieldOrDefault(DB::IColumn & column, const DB::DataTypePtr & type, const DB::SerializationPtr & serialization)
{
String field;
readCSVField(field, *buf, format_settings.csv);
ReadBufferFromString tmp_buf(field);
bool is_bad_value = false;
bool res = false;
/// Remember the row count so a value appended before the failure was detected can be rolled back.
size_t col_size = column.size();
try
{
res = readFieldImpl(tmp_buf, column, type, serialization);
/// Check if we parsed the whole field successfully.
/// Unread bytes left in a non-empty field mean only a prefix of it was consumed.
if (!field.empty() && !tmp_buf.eof())
is_bad_value = true;
}
catch (const Exception &)
{
/// Any Exception thrown while parsing the temporary buffer is treated as a bad value.
is_bad_value = true;
}
if (!is_bad_value)
return res;
/// The deserializer may already have appended a value before the problem was noticed;
/// drop it so that exactly one (default) value is added for this field.
if (column.size() == col_size + 1)
column.popBack(1);
column.insertDefault();
return false;
}
void CSVFormatReader::skipPrefixBeforeHeader()
{
for (size_t i = 0; i != format_settings.csv.skip_first_lines; ++i)

View File

@ -89,8 +89,6 @@ public:
void setReadBuffer(ReadBuffer & in_) override;
FormatSettings::EscapingRule getEscapingRule() const override { return FormatSettings::EscapingRule::CSV; }
bool readFieldImpl(ReadBuffer & istr, DB::IColumn & column, const DB::DataTypePtr & type, const DB::SerializationPtr & serialization);
bool readFieldOrDefault(DB::IColumn & column, const DB::DataTypePtr & type, const DB::SerializationPtr & serialization);
protected:
PeekableReadBuffer * buf;

View File

@ -1,5 +0,0 @@
0 111 1970-01-01 false
1 abc 2023-03-14 true
2 c 1970-01-01 false
4 888 2023-03-14 false
5 bks 1970-01-01 false

View File

@ -1,13 +0,0 @@
#!/usr/bin/env bash
# NOTE: this sh wrapper is required because of shell_config

# Test for input_format_csv_use_default_on_bad_values: inserts a CSV file that
# contains fields which do not match the column types, then prints the table
# contents so they can be compared with the reference output.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl"
$CLICKHOUSE_CLIENT -q "create table test_tbl (a Int32, b String, c Date, e Boolean) engine=MergeTree order by a"
# "$CURDIR" is quoted so the data path survives directories containing spaces or glob characters.
cat "$CURDIR"/data_csv/csv_with_bad_field_values.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tbl SETTINGS input_format_csv_use_default_on_bad_values=true FORMAT CSV"
$CLICKHOUSE_CLIENT -q "select * from test_tbl"
$CLICKHOUSE_CLIENT -q "drop table test_tbl"

View File

@ -1,5 +0,0 @@
1,abc,2023-03-14,true
2,c,ab,false
bc,111,ab,ban
4,888,2023-03-14,false
5,bks,2023-03,abdd
1 1 abc 2023-03-14 true
2 2 c ab false
3 bc 111 ab ban
4 4 888 2023-03-14 false
5 5 bks 2023-03 abdd