Merge remote-tracking branch 'origin/master' into ADQM-868

This commit is contained in:
Dmitry Kardymon 2023-07-11 08:27:01 +00:00
commit 44550d8cdd
14 changed files with 111 additions and 5 deletions

View File

@ -471,6 +471,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
- [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`.
- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Default value - `false`.
## CSVWithNames {#csvwithnames}

View File

@ -931,6 +931,11 @@ Result
```text
" string "
```
### input_format_csv_allow_variable_number_of_columns {#input_format_csv_allow_variable_number_of_columns}
Ignore extra columns in CSV input (if the file has more columns than expected) and treat missing fields in CSV input as default values.
Disabled by default.
### input_format_csv_allow_whitespace_or_tab_as_delimiter {#input_format_csv_allow_whitespace_or_tab_as_delimiter}

View File

@ -401,8 +401,8 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR
- [output_format_csv_crlf_end_of_line](../operations/settings/settings.md#output_format_csv_crlf_end_of_line) - если установлено значение true, конец строки в формате вывода CSV будет `\r\n` вместо `\n`. Значение по умолчанию - `false`.
- [input_format_csv_skip_first_lines](../operations/settings/settings.md#input_format_csv_skip_first_lines) - пропустить указанное количество строк в начале данных. Значение по умолчанию - `0`.
- [input_format_csv_detect_header](../operations/settings/settings.md#input_format_csv_detect_header) - обнаружить заголовок с именами и типами в формате CSV. Значение по умолчанию - `true`.
- [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек.
Значение по умолчанию - `true`.
- [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек. Значение по умолчанию - `true`.
- [input_format_csv_allow_variable_number_of_columns](../operations/settings/settings.md#input_format_csv_allow_variable_number_of_columns) - игнорировать дополнительные столбцы (если файл содержит больше столбцов, чем ожидается) и рассматривать отсутствующие поля в CSV в качестве значений по умолчанию. Значение по умолчанию - `false`.
## CSVWithNames {#csvwithnames}

View File

@ -1686,7 +1686,7 @@ SELECT * FROM table_with_enum_column_for_csv_insert;
## input_format_csv_detect_header {#input_format_csv_detect_header}
Обнаружить заголовок с именами и типами в формате CSV.
Значение по умолчанию - `true`.
## input_format_csv_skip_first_lines {#input_format_csv_skip_first_lines}
@ -1727,6 +1727,12 @@ echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --in
" string "
```
## input_format_csv_allow_variable_number_of_columns {#input_format_csv_allow_variable_number_of_columns}
Игнорировать дополнительные столбцы (если файл содержит больше столбцов, чем ожидается) и рассматривать отсутствующие поля в CSV в качестве значений по умолчанию.
Выключено по умолчанию.
## output_format_tsv_crlf_end_of_line {#settings-output-format-tsv-crlf-end-of-line}
Использовать в качестве разделителя строк для TSV формата CRLF (DOS/Windows стиль) вместо LF (Unix стиль).

View File

@ -1011,6 +1011,7 @@ class IColumn;
M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
\
M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \
M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \
// End of FORMAT_FACTORY_SETTINGS
// Please add settings non-related to formats into the COMMON_SETTINGS above.

View File

@ -138,7 +138,7 @@ Columns CacheDictionary<dictionary_key_type>::getColumns(
const Columns & default_values_columns) const
{
/**
* Flow of getColumsImpl
* Flow of getColumnsImpl
* 1. Get fetch result from storage
* 2. If all keys are found in storage and not expired
* 2.1. If storage returns fetched columns in order of keys then result is returned to client.

View File

@ -72,6 +72,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines;
format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces;
format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter;
format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns;
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;

View File

@ -140,6 +140,7 @@ struct FormatSettings
bool skip_trailing_empty_lines = false;
bool trim_whitespaces = true;
bool allow_whitespace_or_tab_as_delimiter = false;
bool allow_variable_number_of_columns = false;
} csv;
struct HiveText

View File

@ -283,6 +283,11 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
return true;
}
/// Returns the value of the `csv.allow_variable_number_of_columns` format setting,
/// i.e. whether extra CSV columns are ignored and missing ones are filled with defaults.
bool CSVFormatReader::allowVariableNumberOfColumns()
{
    const auto & csv_settings = format_settings.csv;
    return csv_settings.allow_variable_number_of_columns;
}
bool CSVFormatReader::readField(
IColumn & column,
const DataTypePtr & type,
@ -347,6 +352,12 @@ bool CSVFormatReader::checkForSuffix()
return false;
}
/// Tells whether the reader has reached the end of the current CSV row.
/// Leading blanks are skipped first via skipWhitespacesAndTabs (controlled by
/// `csv.allow_whitespace_or_tab_as_delimiter`); the row is considered finished
/// when the buffer is exhausted or the next character is '\n' or '\r'.
bool CSVFormatReader::checkForEndOfRow()
{
    skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);

    /// Nothing left to read: the end of input also terminates the row.
    if (buf->eof())
        return true;

    const auto next_char = *buf->position();
    return next_char == '\n' || next_char == '\r';
}
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
: FormatWithNamesAndTypesSchemaReader(
buf,

View File

@ -69,6 +69,9 @@ public:
void skipRowEndDelimiter() override;
void skipPrefixBeforeHeader() override;
bool checkForEndOfRow() override;
bool allowVariableNumberOfColumns() override;
/// Reads the row with column names; delegates to readHeaderRow().
std::vector<String> readNames() override { return readHeaderRow(); }
/// Reads the row with column types; delegates to readHeaderRow().
std::vector<String> readTypes() override { return readHeaderRow(); }
/// Reads one header row as a list of strings by calling readRowImpl<true>().
/// NOTE(review): the `true` template argument presumably selects header mode — confirm against readRowImpl.
std::vector<String> readHeaderRow() { return readRowImpl<true>(); }

View File

@ -227,7 +227,30 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE
format_reader->skipField(file_column);
if (!is_last_file_column)
{
if (format_reader->allowVariableNumberOfColumns() && format_reader->checkForEndOfRow())
{
++file_column;
while (file_column < column_mapping->column_indexes_for_input_fields.size())
{
const auto & rem_column_index = column_mapping->column_indexes_for_input_fields[file_column];
columns[*rem_column_index]->insertDefault();
++file_column;
}
}
else
format_reader->skipFieldDelimiter();
}
}
if (format_reader->allowVariableNumberOfColumns() && !format_reader->checkForEndOfRow())
{
do
{
format_reader->skipFieldDelimiter();
format_reader->skipField(1);
}
while (!format_reader->checkForEndOfRow());
}
format_reader->skipRowEndDelimiter();

View File

@ -119,6 +119,10 @@ public:
/// Check suffix. The default implementation returns whether the input buffer `in` is at EOF.
virtual bool checkForSuffix() { return in->eof(); }
/// Check whether the reader is at the end of the current row.
/// The base implementation always throws NOT_IMPLEMENTED; format readers that support this check override it.
virtual bool checkForEndOfRow() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method checkForEndOfRow is not implemented"); }
/// Whether a row may contain a different number of columns than expected.
/// Returns false by default; format readers can override it.
virtual bool allowVariableNumberOfColumns() { return false; }
/// Returns a const reference to the format settings held by this reader.
const FormatSettings & getFormatSettings() const { return format_settings; }
/// Makes the reader use `in_` as its input buffer by storing its address in `in`.
virtual void setReadBuffer(ReadBuffer & in_) { in = &in_; }

View File

@ -1,13 +1,32 @@
=== Test input_format_csv_empty_as_default
Hello, world 123 2016-01-01
Hello, "world" 456 2016-01-02
Hello "world" 789 2016-01-03
Hello\n world 100 2016-01-04
default 1 2019-06-19
default-eof 1 2019-06-19
=== Test datetime
2016-01-01 01:02:03 1
2016-01-02 01:02:03 2
2017-08-15 13:15:01 3
1970-01-02 05:46:39 4
=== Test nullable datetime
2016-01-01 01:02:03 NUL
2016-01-02 01:02:03 Nhello
\N \N
=== Test ignore extra columns
Hello 1 String1
Hello 2 String2
Hello 3 String3
Hello 4 String4
Hello 5 String5
Hello 6 String6
=== Test missing as default
0 0 33 \N 55 Default
0 0 33 \N 55 Default
Hello 0 0 33 \N 55 Default
Hello 0 0 33 \N 55 Default
Hello 1 3 2 \N 55 Default
Hello 1 4 2 3 4 String
Hello 1 4 2 3 4 String
Hello 1 5 2 3 4 String

View File

@ -4,6 +4,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
echo === Test input_format_csv_empty_as_default
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS csv";
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 1, d Date DEFAULT '2019-06-19') ENGINE = Memory";
@ -18,6 +19,7 @@ Hello "world", 789 ,2016-01-03
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY d, s";
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
echo === Test datetime
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (t DateTime('Asia/Istanbul'), s String) ENGINE = Memory";
echo '"2016-01-01 01:02:03","1"
@ -28,7 +30,7 @@ echo '"2016-01-01 01:02:03","1"
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s";
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
echo === Test nullable datetime
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (t Nullable(DateTime('Asia/Istanbul')), s Nullable(String)) ENGINE = Memory";
echo 'NULL, NULL
@ -37,3 +39,32 @@ echo 'NULL, NULL
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST";
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
echo === Test ignore extra columns
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 3, d String DEFAULT 'String4') ENGINE = Memory";
echo '"Hello", 1, "String1"
"Hello", 2, "String2",
"Hello", 3, "String3", "2016-01-13"
"Hello", 4, , "2016-01-14"
"Hello", 5, "String5", "2016-01-15", "2016-01-16"
"Hello", 6, "String6" , "line with a
break"' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --input_format_csv_allow_variable_number_of_columns=1 --query="INSERT INTO csv FORMAT CSV";
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s, n";
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
echo === Test missing as default
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (f1 String, f2 UInt64, f3 UInt256, f4 UInt64 Default 33, f5 Nullable(UInt64), f6 Nullable(UInt64) Default 55, f7 String DEFAULT 'Default') ENGINE = Memory";
echo '
,
"Hello"
"Hello",
"Hello", 1, 3, 2
"Hello",1,4,2,3,4,"String"
"Hello", 1, 4, 2, 3, 4, "String"
"Hello", 1, 5, 2, 3, 4, "String",'| $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_allow_variable_number_of_columns=1 --query="INSERT INTO csv FORMAT CSV";
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY f1, f2, f3, f4, f5 NULLS FIRST, f6, f7";
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";