Add docs/ add more cases in test

This commit is contained in:
Dmitry Kardymon 2023-06-14 16:44:31 +00:00
parent ed318d1035
commit a91fc3ddb3
11 changed files with 56 additions and 36 deletions

View File

@ -470,6 +470,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
- [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`.
- [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
- [input_format_csv_ignore_extra_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_ignore_extra_columns) - ignore extra columns in CSV input. Default value - `false`.
## CSVWithNames {#csvwithnames}
@ -2062,7 +2063,7 @@ Special format for reading Parquet file metadata (https://parquet.apache.org/doc
- logical_type - column logical type
- compression - compression used for this column
- total_uncompressed_size - total uncompressed bytes size of the column, calculated as the sum of total_uncompressed_size of the column from all row groups
- total_compressed_size - total compressed bytes size of the column, calculated as the sum of total_compressed_size of the column from all row groups
- total_compressed_size - total compressed bytes size of the column, calculated as the sum of total_compressed_size of the column from all row groups
- space_saved - percent of space saved by compression, calculated as (1 - total_compressed_size/total_uncompressed_size).
- encodings - the list of encodings used for this column
- row_groups - the list of row groups metadata with the next structure:

View File

@ -931,6 +931,11 @@ Result
```text
" string "
```
### input_format_csv_ignore_extra_columns {#input_format_csv_ignore_extra_columns}
Ignore extra columns in CSV input.
Disabled by default.
## Values format settings {#values-format-settings}

View File

@ -401,8 +401,8 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR
- [output_format_csv_crlf_end_of_line](../operations/settings/settings.md#output_format_csv_crlf_end_of_line) - если установлено значение true, конец строки в формате вывода CSV будет `\r\n` вместо `\n`. Значение по умолчанию - `false`.
- [input_format_csv_skip_first_lines](../operations/settings/settings.md#input_format_csv_skip_first_lines) - пропустить указанное количество строк в начале данных. Значение по умолчанию - `0`.
- [input_format_csv_detect_header](../operations/settings/settings.md#input_format_csv_detect_header) - обнаружить заголовок с именами и типами в формате CSV. Значение по умолчанию - `true`.
- [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек.
Значение по умолчанию - `true`.
- [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек. Значение по умолчанию - `true`.
- [input_format_csv_ignore_extra_columns](../operations/settings/settings.md/#input_format_csv_ignore_extra_columns) - игнорировать дополнительные столбцы. Значение по умолчанию - `false`.
## CSVWithNames {#csvwithnames}

View File

@ -1686,7 +1686,7 @@ SELECT * FROM table_with_enum_column_for_csv_insert;
## input_format_csv_detect_header {#input_format_csv_detect_header}
Обнаружить заголовок с именами и типами в формате CSV.
Значение по умолчанию - `true`.
## input_format_csv_skip_first_lines {#input_format_csv_skip_first_lines}
@ -1727,6 +1727,12 @@ echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --in
" string "
```
## input_format_csv_ignore_extra_columns {#input_format_csv_ignore_extra_columns}
Игнорировать дополнительные столбцы.
Выключено по умолчанию.
## output_format_tsv_crlf_end_of_line {#settings-output-format-tsv-crlf-end-of-line}
Использовать в качестве разделителя строк для TSV формата CRLF (DOS/Windows стиль) вместо LF (Unix стиль).

View File

@ -835,7 +835,6 @@ class IColumn;
M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \
M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, -WithNames, -WithNamesAndTypes formats).", IMPORTANT) \
M(Bool, input_format_csv_empty_as_default, true, "Treat empty fields in CSV input as default values.", 0) \
M(Bool, input_format_csv_ignore_extra_columns, false, "", 0) \
M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \
M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices.", 0) \
M(Bool, input_format_null_as_default, true, "Initialize null fields with default values if the data type of this field is not nullable and it is supported by the input format", 0) \
@ -1001,6 +1000,7 @@ class IColumn;
M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
\
M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \
M(Bool, input_format_csv_ignore_extra_columns, false, "Ignore extra columns in CSV input", 0) \
// End of FORMAT_FACTORY_SETTINGS
// Please add settings non-related to formats into the COMMON_SETTINGS above.

View File

@ -63,7 +63,6 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.delimiter = settings.format_csv_delimiter;
format_settings.csv.tuple_delimiter = settings.format_csv_delimiter;
format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default;
format_settings.csv.ignore_extra_columns = settings.input_format_csv_ignore_extra_columns;
format_settings.csv.enum_as_number = settings.input_format_csv_enum_as_number;
format_settings.csv.null_representation = settings.format_csv_null_representation;
format_settings.csv.arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
@ -72,6 +71,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.try_detect_header = settings.input_format_csv_detect_header;
format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines;
format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces;
format_settings.csv.ignore_extra_columns = settings.input_format_csv_ignore_extra_columns;
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;

View File

@ -128,7 +128,6 @@ struct FormatSettings
bool allow_single_quotes = true;
bool allow_double_quotes = true;
bool empty_as_default = false;
bool ignore_extra_columns = false;
bool crlf_end_of_line = false;
bool enum_as_number = false;
bool arrays_as_nested_csv = false;
@ -140,6 +139,7 @@ struct FormatSettings
bool try_detect_header = true;
bool skip_trailing_empty_lines = false;
bool trim_whitespaces = true;
bool ignore_extra_columns = false;
} csv;
struct HiveText

View File

@ -288,6 +288,8 @@ bool CSVFormatReader::readField(
const bool at_delimiter = !buf->eof() && *buf->position() == format_settings.csv.delimiter;
const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n' || *buf->position() == '\r');
bool res = false;
/// Note: Tuples are serialized in CSV as separate columns, but with empty_as_default or null_as_default
/// only one empty or NULL column will be expected
if (format_settings.csv.empty_as_default && (at_delimiter || at_last_column_line_end))
@ -299,31 +301,28 @@ bool CSVFormatReader::readField(
/// they do not contain empty unquoted fields, so this check
/// works for tuples as well.
column.insertDefault();
return false;
}
auto skip_all = [&]()
{
if (!is_last_file_column || !format_settings.csv.ignore_extra_columns)
{
return;
}
//std::cout << "skip !!!" << std::endl;
buf->position() = find_first_symbols<'\n'>(buf->position(), buf->buffer().end());
};
if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
else if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
{
/// If value is null but type is not nullable then use default value instead.
bool res = SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization);
skip_all();
return res;
res = SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization);
}
else
{
/// Read the column normally.
serialization->deserializeTextCSV(column, *buf, format_settings);
res = true;
}
/// Read the column normally.
serialization->deserializeTextCSV(column, *buf, format_settings);
skip_all();
return true;
if (is_last_file_column && format_settings.csv.ignore_extra_columns)
{
while (checkChar(format_settings.csv.delimiter, *buf))
{
skipField();
skipWhitespacesAndTabs(*buf);
}
}
return res;
}
void CSVFormatReader::skipPrefixBeforeHeader()

View File

@ -212,8 +212,12 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE
format_reader->skipRowStartDelimiter();
ext.read_columns.resize(data_types.size());
//std::cout << "col size " << column_mapping->column_indexes_for_input_fields.size() << std::endl;
for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
{
// std::cout << " file_column " << file_column << column_mapping->names_of_columns[file_column] << std::endl;
const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column];
const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size();
if (column_index)

View File

@ -11,7 +11,9 @@ default-eof 1 2019-06-19
2016-01-01 01:02:03 NUL
2016-01-02 01:02:03 Nhello
\N \N
Hello world 1 2016-01-01
Hello world 2 2016-01-02
Hello world 3 2016-01-03
Hello world 4 2016-01-04
Hello 1 String1
Hello 2 String2
Hello 3 String3
Hello 4 String4
Hello 5 String5
Hello 6 String6

View File

@ -39,11 +39,14 @@ $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST";
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 1, d Date DEFAULT '2019-06-19') ENGINE = Memory";
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 3, d String DEFAULT 'String4') ENGINE = Memory";
echo 'Hello world, 1, 2016-01-01
Hello world, 2 ,2016-01-02,
Hello world, 3 ,2016-01-03, 2016-01-13
Hello world, 4 ,2016-01-04, 2016-01-14, 2016-01-15' | $CLICKHOUSE_CLIENT --input_format_csv_empty_as_default=1 --input_format_csv_ignore_extra_columns=1 --query="INSERT INTO csv FORMAT CSV";
echo 'Hello, 1, String1
Hello, 2, String2,
Hello, 3, String3, 2016-01-13
Hello, 4, , 2016-01-14
Hello, 5, String5, 2016-01-15, 2016-01-16
Hello, 6, String6, "line with a
break"' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --input_format_csv_ignore_extra_columns=1 --query="INSERT INTO csv FORMAT CSV";
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s, n";
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";