mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 01:25:21 +00:00
Merge remote-tracking branch 'origin/master' into ADQM-976
This commit is contained in:
commit
6d7e98590e
@ -471,6 +471,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
|
||||
- [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.
|
||||
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
|
||||
- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`.
|
||||
- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Default value - `false`.
|
||||
|
||||
## CSVWithNames {#csvwithnames}
|
||||
|
||||
|
@ -931,6 +931,11 @@ Result
|
||||
```text
|
||||
" string "
|
||||
```
|
||||
### input_format_csv_allow_variable_number_of_columns {#input_format_csv_allow_variable_number_of_columns}
|
||||
|
||||
Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values.
|
||||
|
||||
Disabled by default.
|
||||
|
||||
### input_format_csv_allow_whitespace_or_tab_as_delimiter {#input_format_csv_allow_whitespace_or_tab_as_delimiter}
|
||||
|
||||
|
@ -401,8 +401,8 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR
|
||||
- [output_format_csv_crlf_end_of_line](../operations/settings/settings.md#output_format_csv_crlf_end_of_line) - если установлено значение true, конец строки в формате вывода CSV будет `\r\n` вместо `\n`. Значение по умолчанию - `false`.
|
||||
- [input_format_csv_skip_first_lines](../operations/settings/settings.md#input_format_csv_skip_first_lines) - пропустить указанное количество строк в начале данных. Значение по умолчанию - `0`.
|
||||
- [input_format_csv_detect_header](../operations/settings/settings.md#input_format_csv_detect_header) - обнаружить заголовок с именами и типами в формате CSV. Значение по умолчанию - `true`.
|
||||
- [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек.
|
||||
Значение по умолчанию - `true`.
|
||||
- [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек. Значение по умолчанию - `true`.
|
||||
- [input_format_csv_allow_variable_number_of_columns](../operations/settings/settings.md/#input_format_csv_allow_variable_number_of_columns) - игнорировать дополнительные столбцы (если файл содержит больше столбцов, чем ожидается) и рассматривать отсутствующие поля в CSV в качестве значений по умолчанию. Значение по умолчанию - `false`.
|
||||
|
||||
## CSVWithNames {#csvwithnames}
|
||||
|
||||
|
@ -1686,7 +1686,7 @@ SELECT * FROM table_with_enum_column_for_csv_insert;
|
||||
## input_format_csv_detect_header {#input_format_csv_detect_header}
|
||||
|
||||
Обнаружить заголовок с именами и типами в формате CSV.
|
||||
|
||||
|
||||
Значение по умолчанию - `true`.
|
||||
|
||||
## input_format_csv_skip_first_lines {#input_format_csv_skip_first_lines}
|
||||
@ -1727,6 +1727,12 @@ echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --in
|
||||
" string "
|
||||
```
|
||||
|
||||
## input_format_csv_allow_variable_number_of_columns {#input_format_csv_allow_variable_number_of_columns}
|
||||
|
||||
Игнорировать дополнительные столбцы (если файл содержит больше столбцов, чем ожидается) и рассматривать отсутствующие поля в CSV в качестве значений по умолчанию.
|
||||
|
||||
Выключено по умолчанию.
|
||||
|
||||
## output_format_tsv_crlf_end_of_line {#settings-output-format-tsv-crlf-end-of-line}
|
||||
|
||||
Использовать в качестве разделителя строк для TSV формата CRLF (DOS/Windows стиль) вместо LF (Unix стиль).
|
||||
|
@ -1011,6 +1011,7 @@ class IColumn;
|
||||
M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
|
||||
\
|
||||
M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \
|
||||
M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \
|
||||
|
||||
// End of FORMAT_FACTORY_SETTINGS
|
||||
// Please add settings non-related to formats into the COMMON_SETTINGS above.
|
||||
|
@ -138,7 +138,7 @@ Columns CacheDictionary<dictionary_key_type>::getColumns(
|
||||
const Columns & default_values_columns) const
|
||||
{
|
||||
/**
|
||||
* Flow of getColumsImpl
|
||||
* Flow of getColumnsImpl
|
||||
* 1. Get fetch result from storage
|
||||
* 2. If all keys are found in storage and not expired
|
||||
* 2.1. If storage returns fetched columns in order of keys then result is returned to client.
|
||||
|
@ -72,6 +72,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines;
|
||||
format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces;
|
||||
format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter;
|
||||
format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns;
|
||||
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
|
||||
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
|
||||
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
|
||||
|
@ -140,6 +140,7 @@ struct FormatSettings
|
||||
bool skip_trailing_empty_lines = false;
|
||||
bool trim_whitespaces = true;
|
||||
bool allow_whitespace_or_tab_as_delimiter = false;
|
||||
bool allow_variable_number_of_columns = false;
|
||||
} csv;
|
||||
|
||||
struct HiveText
|
||||
|
@ -283,6 +283,11 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Reports whether CSV rows may have a variable number of columns.
/// Simply returns the `csv.allow_variable_number_of_columns` format setting
/// (populated from `input_format_csv_allow_variable_number_of_columns`).
bool CSVFormatReader::allowVariableNumberOfColumns()
|
||||
{
|
||||
return format_settings.csv.allow_variable_number_of_columns;
|
||||
}
|
||||
|
||||
bool CSVFormatReader::readField(
|
||||
IColumn & column,
|
||||
const DataTypePtr & type,
|
||||
@ -347,6 +352,12 @@ bool CSVFormatReader::checkForSuffix()
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Checks whether the reader is positioned at the end of the current row.
/// First calls skipWhitespacesAndTabs on the buffer (passing the
/// `csv.allow_whitespace_or_tab_as_delimiter` setting), then returns true if
/// the buffer is at EOF or the next character is '\n' or '\r'.
/// Note: the whitespace skipping advances the buffer, so this check is not side-effect free.
bool CSVFormatReader::checkForEndOfRow()
|
||||
{
|
||||
skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);
|
||||
return buf->eof() || *buf->position() == '\n' || *buf->position() == '\r';
|
||||
}
|
||||
|
||||
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
|
||||
: FormatWithNamesAndTypesSchemaReader(
|
||||
buf,
|
||||
|
@ -69,6 +69,9 @@ public:
|
||||
void skipRowEndDelimiter() override;
|
||||
void skipPrefixBeforeHeader() override;
|
||||
|
||||
bool checkForEndOfRow() override;
|
||||
bool allowVariableNumberOfColumns() override;
|
||||
|
||||
std::vector<String> readNames() override { return readHeaderRow(); }
|
||||
std::vector<String> readTypes() override { return readHeaderRow(); }
|
||||
std::vector<String> readHeaderRow() { return readRowImpl<true>(); }
|
||||
|
@ -227,7 +227,30 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE
|
||||
format_reader->skipField(file_column);
|
||||
|
||||
if (!is_last_file_column)
|
||||
{
|
||||
if (format_reader->allowVariableNumberOfColumns() && format_reader->checkForEndOfRow())
|
||||
{
|
||||
++file_column;
|
||||
while (file_column < column_mapping->column_indexes_for_input_fields.size())
|
||||
{
|
||||
const auto & rem_column_index = column_mapping->column_indexes_for_input_fields[file_column];
|
||||
columns[*rem_column_index]->insertDefault();
|
||||
++file_column;
|
||||
}
|
||||
}
|
||||
else
|
||||
format_reader->skipFieldDelimiter();
|
||||
}
|
||||
}
|
||||
|
||||
if (format_reader->allowVariableNumberOfColumns() && !format_reader->checkForEndOfRow())
|
||||
{
|
||||
do
|
||||
{
|
||||
format_reader->skipFieldDelimiter();
|
||||
format_reader->skipField(1);
|
||||
}
|
||||
while (!format_reader->checkForEndOfRow());
|
||||
}
|
||||
|
||||
format_reader->skipRowEndDelimiter();
|
||||
|
@ -119,6 +119,10 @@ public:
|
||||
/// Check suffix.
|
||||
virtual bool checkForSuffix() { return in->eof(); }
|
||||
|
||||
virtual bool checkForEndOfRow() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method checkForEndOfRow is not implemented"); }
|
||||
|
||||
virtual bool allowVariableNumberOfColumns() { return false; }
|
||||
|
||||
const FormatSettings & getFormatSettings() const { return format_settings; }
|
||||
|
||||
virtual void setReadBuffer(ReadBuffer & in_) { in = &in_; }
|
||||
|
@ -1,13 +1,32 @@
|
||||
=== Test input_format_csv_empty_as_default
|
||||
Hello, world 123 2016-01-01
|
||||
Hello, "world" 456 2016-01-02
|
||||
Hello "world" 789 2016-01-03
|
||||
Hello\n world 100 2016-01-04
|
||||
default 1 2019-06-19
|
||||
default-eof 1 2019-06-19
|
||||
=== Test datetime
|
||||
2016-01-01 01:02:03 1
|
||||
2016-01-02 01:02:03 2
|
||||
2017-08-15 13:15:01 3
|
||||
1970-01-02 05:46:39 4
|
||||
=== Test nullable datetime
|
||||
2016-01-01 01:02:03 NUL
|
||||
2016-01-02 01:02:03 Nhello
|
||||
\N \N
|
||||
=== Test ignore extra columns
|
||||
Hello 1 String1
|
||||
Hello 2 String2
|
||||
Hello 3 String3
|
||||
Hello 4 String4
|
||||
Hello 5 String5
|
||||
Hello 6 String6
|
||||
=== Test missing as default
|
||||
0 0 33 \N 55 Default
|
||||
0 0 33 \N 55 Default
|
||||
Hello 0 0 33 \N 55 Default
|
||||
Hello 0 0 33 \N 55 Default
|
||||
Hello 1 3 2 \N 55 Default
|
||||
Hello 1 4 2 3 4 String
|
||||
Hello 1 4 2 3 4 String
|
||||
Hello 1 5 2 3 4 String
|
||||
|
@ -4,6 +4,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
echo === Test input_format_csv_empty_as_default
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS csv";
|
||||
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 1, d Date DEFAULT '2019-06-19') ENGINE = Memory";
|
||||
|
||||
@ -18,6 +19,7 @@ Hello "world", 789 ,2016-01-03
|
||||
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY d, s";
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
|
||||
|
||||
echo === Test datetime
|
||||
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (t DateTime('Asia/Istanbul'), s String) ENGINE = Memory";
|
||||
|
||||
echo '"2016-01-01 01:02:03","1"
|
||||
@ -28,7 +30,7 @@ echo '"2016-01-01 01:02:03","1"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s";
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
|
||||
|
||||
|
||||
echo === Test nullable datetime
|
||||
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (t Nullable(DateTime('Asia/Istanbul')), s Nullable(String)) ENGINE = Memory";
|
||||
|
||||
echo 'NULL, NULL
|
||||
@ -37,3 +39,32 @@ echo 'NULL, NULL
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST";
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
|
||||
|
||||
|
||||
echo === Test ignore extra columns
|
||||
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 3, d String DEFAULT 'String4') ENGINE = Memory";
|
||||
|
||||
echo '"Hello", 1, "String1"
|
||||
"Hello", 2, "String2",
|
||||
"Hello", 3, "String3", "2016-01-13"
|
||||
"Hello", 4, , "2016-01-14"
|
||||
"Hello", 5, "String5", "2016-01-15", "2016-01-16"
|
||||
"Hello", 6, "String6" , "line with a
|
||||
break"' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --input_format_csv_allow_variable_number_of_columns=1 --query="INSERT INTO csv FORMAT CSV";
|
||||
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s, n";
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
|
||||
|
||||
|
||||
echo === Test missing as default
|
||||
$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (f1 String, f2 UInt64, f3 UInt256, f4 UInt64 Default 33, f5 Nullable(UInt64), f6 Nullable(UInt64) Default 55, f7 String DEFAULT 'Default') ENGINE = Memory";
|
||||
|
||||
echo '
|
||||
,
|
||||
"Hello"
|
||||
"Hello",
|
||||
"Hello", 1, 3, 2
|
||||
"Hello",1,4,2,3,4,"String"
|
||||
"Hello", 1, 4, 2, 3, 4, "String"
|
||||
"Hello", 1, 5, 2, 3, 4, "String",'| $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_allow_variable_number_of_columns=1 --query="INSERT INTO csv FORMAT CSV";
|
||||
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY f1, f2, f3, f4, f5 NULLS FIRST, f6, f7";
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
|
||||
|
Loading…
Reference in New Issue
Block a user