mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 01:25:21 +00:00
Revert "Improve CSVInputFormat to check and set default value to column if deserialize failed"
This commit is contained in:
parent
63d0616a22
commit
7b3564f96a
@ -472,7 +472,6 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
|
||||
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
|
||||
- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - allow whitespace or tab to be used as the field delimiter in CSV strings. Default value - `false`.
|
||||
- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Default value - `false`.
|
||||
- [input_format_csv_use_default_on_bad_values](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_default_on_bad_values) - insert the column's default value when a CSV field cannot be deserialized because it contains a bad value. Default value - `false`.
|
||||
|
||||
## CSVWithNames {#csvwithnames}
|
||||
|
||||
|
@ -989,28 +989,6 @@ Result
|
||||
a b
|
||||
```
|
||||
|
||||
### input_format_csv_use_default_on_bad_values {#input_format_csv_use_default_on_bad_values}
|
||||
|
||||
Allows inserting the column's default value when deserialization of a CSV field fails because of a bad value.
|
||||
|
||||
Default value: `false`.
|
||||
|
||||
**Examples**
|
||||
|
||||
Query
|
||||
|
||||
```bash
|
||||
./clickhouse local -q "create table test_tbl (x String, y UInt32, z Date) engine=MergeTree order by x"
|
||||
echo 'a,b,c' | ./clickhouse local -q "INSERT INTO test_tbl SETTINGS input_format_csv_use_default_on_bad_values=true FORMAT CSV"
|
||||
./clickhouse local -q "select * from test_tbl"
|
||||
```
|
||||
|
||||
Result
|
||||
|
||||
```text
|
||||
a 0 1970-01-01
|
||||
```
|
||||
|
||||
## Values format settings {#values-format-settings}
|
||||
|
||||
### input_format_values_interpret_expressions {#input_format_values_interpret_expressions}
|
||||
|
@ -874,7 +874,6 @@ class IColumn;
|
||||
M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
|
||||
M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \
|
||||
M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \
|
||||
M(Bool, input_format_csv_use_default_on_bad_values, false, "Allow to set default value to column when CSV field deserialization failed on bad value", 0) \
|
||||
M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \
|
||||
M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \
|
||||
M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \
|
||||
|
@ -73,7 +73,6 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces;
|
||||
format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter;
|
||||
format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns;
|
||||
format_settings.csv.use_default_on_bad_values = settings.input_format_csv_use_default_on_bad_values;
|
||||
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
|
||||
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
|
||||
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
|
||||
|
@ -152,7 +152,6 @@ struct FormatSettings
|
||||
bool trim_whitespaces = true;
|
||||
bool allow_whitespace_or_tab_as_delimiter = false;
|
||||
bool allow_variable_number_of_columns = false;
|
||||
bool use_default_on_bad_values = false;
|
||||
} csv;
|
||||
|
||||
struct HiveText
|
||||
|
@ -1,5 +1,4 @@
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <IO/BufferWithOwnMemory.h>
|
||||
#include <IO/Operators.h>
|
||||
|
||||
@ -316,54 +315,17 @@ bool CSVFormatReader::readField(
|
||||
return false;
|
||||
}
|
||||
|
||||
if (format_settings.csv.use_default_on_bad_values)
|
||||
return readFieldOrDefault(column, type, serialization);
|
||||
return readFieldImpl(*buf, column, type, serialization);
|
||||
}
|
||||
|
||||
/// Deserializes a single CSV field into `column`.
///
/// If `format_settings.null_as_default` is set and `type` is neither Nullable nor
/// LowCardinality(Nullable), the work is delegated to
/// SerializationNullable::deserializeTextCSVImpl and its result is returned.
/// Otherwise the field is read through `serialization->deserializeTextCSV` and
/// the function returns true.
///
/// NOTE(review): this span comes from a rendered diff without +/- markers. The paired
/// statements below that differ only in `istr` vs `*buf` appear to be the two sides of
/// the same change (parameterized buffer vs. the reader's own buffer), not consecutive
/// statements — confirm against the raw diff before treating this as compilable code.
bool CSVFormatReader::readFieldImpl(ReadBuffer & istr, DB::IColumn & column, const DB::DataTypePtr & type, const DB::SerializationPtr & serialization)
|
||||
{
|
||||
if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
|
||||
{
|
||||
/// If value is null but type is not nullable then use default value instead.
|
||||
// Variant reading from the caller-supplied buffer `istr`.
return SerializationNullable::deserializeTextCSVImpl(column, istr, format_settings, serialization);
|
||||
// Variant reading directly from the reader's own buffer `*buf`.
return SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization);
|
||||
}
|
||||
|
||||
/// Read the column normally.
|
||||
// Variant reading from the caller-supplied buffer `istr`.
serialization->deserializeTextCSV(column, istr, format_settings);
|
||||
// Variant reading directly from the reader's own buffer `*buf`.
serialization->deserializeTextCSV(column, *buf, format_settings);
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Reads one CSV field and, if it cannot be parsed as a value of the column, inserts the
/// column's default value instead of propagating the error.
///
/// The field is first pulled out of the main buffer into a temporary string, and the
/// actual deserialization is run on that string via readFieldImpl.
///
/// Returns the result of readFieldImpl when the field was parsed completely;
/// returns false when the value was judged bad and a default was inserted.
bool CSVFormatReader::readFieldOrDefault(DB::IColumn & column, const DB::DataTypePtr & type, const DB::SerializationPtr & serialization)
|
||||
{
|
||||
String field;
|
||||
// Extract the field text from `*buf` into `field`, then parse from a buffer over that string.
readCSVField(field, *buf, format_settings.csv);
|
||||
ReadBufferFromString tmp_buf(field);
|
||||
bool is_bad_value = false;
|
||||
bool res = false;
|
||||
|
||||
// Remember the size before parsing so a value appended by a failed parse can be detected below.
size_t col_size = column.size();
|
||||
try
|
||||
{
|
||||
res = readFieldImpl(tmp_buf, column, type, serialization);
|
||||
/// Check if we parsed the whole field successfully.
|
||||
// A non-empty field with unread bytes left in the temporary buffer counts as a bad value.
if (!field.empty() && !tmp_buf.eof())
|
||||
is_bad_value = true;
|
||||
}
|
||||
// Any Exception thrown during deserialization is treated as a bad value rather than rethrown.
catch (const Exception &)
|
||||
{
|
||||
is_bad_value = true;
|
||||
}
|
||||
|
||||
if (!is_bad_value)
|
||||
return res;
|
||||
|
||||
// If the failed parse still appended exactly one element, remove it before inserting the default.
if (column.size() == col_size + 1)
|
||||
column.popBack(1);
|
||||
column.insertDefault();
|
||||
return false;
|
||||
}
|
||||
|
||||
void CSVFormatReader::skipPrefixBeforeHeader()
|
||||
{
|
||||
for (size_t i = 0; i != format_settings.csv.skip_first_lines; ++i)
|
||||
|
@ -89,8 +89,6 @@ public:
|
||||
void setReadBuffer(ReadBuffer & in_) override;
|
||||
|
||||
FormatSettings::EscapingRule getEscapingRule() const override { return FormatSettings::EscapingRule::CSV; }
|
||||
bool readFieldImpl(ReadBuffer & istr, DB::IColumn & column, const DB::DataTypePtr & type, const DB::SerializationPtr & serialization);
|
||||
bool readFieldOrDefault(DB::IColumn & column, const DB::DataTypePtr & type, const DB::SerializationPtr & serialization);
|
||||
|
||||
protected:
|
||||
PeekableReadBuffer * buf;
|
||||
|
@ -1,5 +0,0 @@
|
||||
0 111 1970-01-01 false
|
||||
1 abc 2023-03-14 true
|
||||
2 c 1970-01-01 false
|
||||
4 888 2023-03-14 false
|
||||
5 bks 1970-01-01 false
|
@ -1,13 +0,0 @@
|
||||
#!/usr/bin/env bash

# NOTE: this sh wrapper is required because of shell_config

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Inserts a CSV file containing unparsable field values with
# input_format_csv_use_default_on_bad_values enabled, then prints the table contents.

# Start from a clean table.
$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl"
$CLICKHOUSE_CLIENT -q "create table test_tbl (a Int32, b String, c Date, e Boolean) engine=MergeTree order by a"

# "$CURDIR" is quoted so a checkout path containing spaces or glob characters is not split.
cat "$CURDIR"/data_csv/csv_with_bad_field_values.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tbl SETTINGS input_format_csv_use_default_on_bad_values=true FORMAT CSV"

# Print the rows inserted above.
$CLICKHOUSE_CLIENT -q "select * from test_tbl"
$CLICKHOUSE_CLIENT -q "drop table test_tbl"
|
@ -1,5 +0,0 @@
|
||||
1,abc,2023-03-14,true
|
||||
2,c,ab,false
|
||||
bc,111,ab,ban
|
||||
4,888,2023-03-14,false
|
||||
5,bks,2023-03,abdd
|
|
Loading…
Reference in New Issue
Block a user