mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 09:32:06 +00:00
Add docs/ add more cases in test
This commit is contained in:
parent
ed318d1035
commit
a91fc3ddb3
@ -470,6 +470,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
|
||||
- [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`.
|
||||
- [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.
|
||||
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
|
||||
- [input_format_csv_ignore_extra_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_ignore_extra_columns) - ignore extra columns in CSV input (if a row has more fields than expected). Default value - `false`.
|
||||
|
||||
## CSVWithNames {#csvwithnames}
|
||||
|
||||
|
@ -931,6 +931,11 @@ Result
|
||||
```text
|
||||
" string "
|
||||
```
|
||||
### input_format_csv_ignore_extra_columns {#input_format_csv_ignore_extra_columns}
|
||||
|
||||
Ignore extra columns in CSV input (if a row has more fields than expected).
|
||||
|
||||
Disabled by default.
|
||||
|
||||
## Values format settings {#values-format-settings}
|
||||
|
||||
|
@ -401,8 +401,8 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR
|
||||
- [output_format_csv_crlf_end_of_line](../operations/settings/settings.md#output_format_csv_crlf_end_of_line) - если установлено значение true, конец строки в формате вывода CSV будет `\r\n` вместо `\n`. Значение по умолчанию - `false`.
|
||||
- [input_format_csv_skip_first_lines](../operations/settings/settings.md#input_format_csv_skip_first_lines) - пропустить указанное количество строк в начале данных. Значение по умолчанию - `0`.
|
||||
- [input_format_csv_detect_header](../operations/settings/settings.md#input_format_csv_detect_header) - обнаружить заголовок с именами и типами в формате CSV. Значение по умолчанию - `true`.
|
||||
- [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек.
|
||||
Значение по умолчанию - `true`.
|
||||
- [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек. Значение по умолчанию - `true`.
|
||||
- [input_format_csv_ignore_extra_columns](../operations/settings/settings.md#input_format_csv_ignore_extra_columns) - игнорировать дополнительные столбцы во входных данных CSV. Значение по умолчанию - `false`.
|
||||
|
||||
## CSVWithNames {#csvwithnames}
|
||||
|
||||
|
@ -1727,6 +1727,12 @@ echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --in
|
||||
" string "
|
||||
```
|
||||
|
||||
## input_format_csv_ignore_extra_columns {#input_format_csv_ignore_extra_columns}
|
||||
|
||||
Игнорировать дополнительные столбцы.
|
||||
|
||||
Выключено по умолчанию.
|
||||
|
||||
## output_format_tsv_crlf_end_of_line {#settings-output-format-tsv-crlf-end-of-line}
|
||||
|
||||
Использовать в качестве разделителя строк для TSV формата CRLF (DOS/Windows стиль) вместо LF (Unix стиль).
|
||||
|
@ -835,7 +835,6 @@ class IColumn;
|
||||
M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \
|
||||
M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, -WithNames, -WithNamesAndTypes formats).", IMPORTANT) \
|
||||
M(Bool, input_format_csv_empty_as_default, true, "Treat empty fields in CSV input as default values.", 0) \
|
||||
M(Bool, input_format_csv_ignore_extra_columns, false, "", 0) \
|
||||
M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \
|
||||
M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices.", 0) \
|
||||
M(Bool, input_format_null_as_default, true, "Initialize null fields with default values if the data type of this field is not nullable and it is supported by the input format", 0) \
|
||||
@ -1001,6 +1000,7 @@ class IColumn;
|
||||
M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
|
||||
\
|
||||
M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \
|
||||
M(Bool, input_format_csv_ignore_extra_columns, false, "Ignore extra colums in CSV input", 0) \
|
||||
|
||||
// End of FORMAT_FACTORY_SETTINGS
|
||||
// Please add settings non-related to formats into the COMMON_SETTINGS above.
|
||||
|
@ -63,7 +63,6 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.csv.delimiter = settings.format_csv_delimiter;
|
||||
format_settings.csv.tuple_delimiter = settings.format_csv_delimiter;
|
||||
format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default;
|
||||
format_settings.csv.ignore_extra_columns = settings.input_format_csv_ignore_extra_columns;
|
||||
format_settings.csv.enum_as_number = settings.input_format_csv_enum_as_number;
|
||||
format_settings.csv.null_representation = settings.format_csv_null_representation;
|
||||
format_settings.csv.arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
|
||||
@ -72,6 +71,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.csv.try_detect_header = settings.input_format_csv_detect_header;
|
||||
format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines;
|
||||
format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces;
|
||||
format_settings.csv.ignore_extra_columns = settings.input_format_csv_ignore_extra_columns;
|
||||
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
|
||||
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
|
||||
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
|
||||
|
@ -128,7 +128,6 @@ struct FormatSettings
|
||||
bool allow_single_quotes = true;
|
||||
bool allow_double_quotes = true;
|
||||
bool empty_as_default = false;
|
||||
bool ignore_extra_columns = false;
|
||||
bool crlf_end_of_line = false;
|
||||
bool enum_as_number = false;
|
||||
bool arrays_as_nested_csv = false;
|
||||
@ -140,6 +139,7 @@ struct FormatSettings
|
||||
bool try_detect_header = true;
|
||||
bool skip_trailing_empty_lines = false;
|
||||
bool trim_whitespaces = true;
|
||||
bool ignore_extra_columns = false;
|
||||
} csv;
|
||||
|
||||
struct HiveText
|
||||
|
@ -288,6 +288,8 @@ bool CSVFormatReader::readField(
|
||||
const bool at_delimiter = !buf->eof() && *buf->position() == format_settings.csv.delimiter;
|
||||
const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n' || *buf->position() == '\r');
|
||||
|
||||
bool res = false;
|
||||
|
||||
/// Note: Tuples are serialized in CSV as separate columns, but with empty_as_default or null_as_default
|
||||
/// only one empty or NULL column will be expected
|
||||
if (format_settings.csv.empty_as_default && (at_delimiter || at_last_column_line_end))
|
||||
@ -299,31 +301,28 @@ bool CSVFormatReader::readField(
|
||||
/// they do not contain empty unquoted fields, so this check
|
||||
/// works for tuples as well.
|
||||
column.insertDefault();
|
||||
return false;
|
||||
}
|
||||
|
||||
auto skip_all = [&]()
|
||||
{
|
||||
if (!is_last_file_column || !format_settings.csv.ignore_extra_columns)
|
||||
{
|
||||
return;
|
||||
}
|
||||
//std::cout << "skip !!!" << std::endl;
|
||||
buf->position() = find_first_symbols<'\n'>(buf->position(), buf->buffer().end());
|
||||
};
|
||||
if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
|
||||
else if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
|
||||
{
|
||||
/// If value is null but type is not nullable then use default value instead.
|
||||
bool res = SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization);
|
||||
skip_all();
|
||||
return res;
|
||||
res = SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization);
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
/// Read the column normally.
|
||||
serialization->deserializeTextCSV(column, *buf, format_settings);
|
||||
res = true;
|
||||
}
|
||||
|
||||
skip_all();
|
||||
return true;
|
||||
if (is_last_file_column && format_settings.csv.ignore_extra_columns)
|
||||
{
|
||||
while (checkChar(format_settings.csv.delimiter, *buf))
|
||||
{
|
||||
skipField();
|
||||
skipWhitespacesAndTabs(*buf);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
void CSVFormatReader::skipPrefixBeforeHeader()
|
||||
|
@ -212,8 +212,12 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE
|
||||
format_reader->skipRowStartDelimiter();
|
||||
|
||||
ext.read_columns.resize(data_types.size());
|
||||
//std::cout << "col size " << column_mapping->column_indexes_for_input_fields.size() << std::endl;
|
||||
for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
|
||||
{
|
||||
// std::cout << " file_column " << file_column << column_mapping->names_of_columns[file_column] << std::endl;
|
||||
|
||||
|
||||
const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column];
|
||||
const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size();
|
||||
if (column_index)
|
||||
|
@ -11,7 +11,9 @@ default-eof 1 2019-06-19
|
||||
2016-01-01 01:02:03 NUL
|
||||
2016-01-02 01:02:03 Nhello
|
||||
\N \N
|
||||
Hello world 1 2016-01-01
|
||||
Hello world 2 2016-01-02
|
||||
Hello world 3 2016-01-03
|
||||
Hello world 4 2016-01-04
|
||||
Hello 1 String1
|
||||
Hello 2 String2
|
||||
Hello 3 String3
|
||||
Hello 4 String4
|
||||
Hello 5 String5
|
||||
Hello 6 String6
|
||||
|
@ -39,11 +39,14 @@ $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST";
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
|
||||
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 1, d Date DEFAULT '2019-06-19') ENGINE = Memory";
|
||||
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 3, d String DEFAULT 'String4') ENGINE = Memory";
|
||||
|
||||
echo 'Hello world, 1, 2016-01-01
|
||||
Hello world, 2 ,2016-01-02,
|
||||
Hello world, 3 ,2016-01-03, 2016-01-13
|
||||
Hello world, 4 ,2016-01-04, 2016-01-14, 2016-01-15' | $CLICKHOUSE_CLIENT --input_format_csv_empty_as_default=1 --input_format_csv_ignore_extra_columns=1 --query="INSERT INTO csv FORMAT CSV";
|
||||
echo 'Hello, 1, String1
|
||||
Hello, 2, String2,
|
||||
Hello, 3, String3, 2016-01-13
|
||||
Hello, 4, , 2016-01-14
|
||||
Hello, 5, String5, 2016-01-15, 2016-01-16
|
||||
Hello, 6, String6, "line with a
|
||||
break"' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --input_format_csv_ignore_extra_columns=1 --query="INSERT INTO csv FORMAT CSV";
|
||||
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s, n";
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
|
Loading…
Reference in New Issue
Block a user