From 6d77d52dfe034afe196fa1219ddc8897d1070146 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 27 Jul 2023 18:02:29 +0000 Subject: [PATCH 1/4] Allow variable number of columns in TSV/CustomSeparated/JSONCompactEachRow, make schema inference work with variable number of columns --- docs/en/interfaces/formats.md | 11 ++-- .../operations/settings/settings-formats.md | 25 +++++++- src/Core/Settings.h | 5 +- src/Formats/FormatFactory.cpp | 3 + src/Formats/FormatSettings.h | 3 + src/Processors/Formats/ISchemaReader.cpp | 46 +++++++++++--- src/Processors/Formats/ISchemaReader.h | 6 +- .../Formats/Impl/CSVRowInputFormat.cpp | 13 ++-- .../Formats/Impl/CSVRowInputFormat.h | 8 ++- .../Impl/CustomSeparatedRowInputFormat.cpp | 19 +++--- .../Impl/CustomSeparatedRowInputFormat.h | 10 +++- .../Impl/JSONCompactEachRowRowInputFormat.cpp | 8 ++- .../Impl/JSONCompactEachRowRowInputFormat.h | 7 ++- .../Formats/Impl/MsgPackRowInputFormat.cpp | 2 +- .../Formats/Impl/MsgPackRowInputFormat.h | 2 +- .../Formats/Impl/MySQLDumpRowInputFormat.cpp | 2 +- .../Formats/Impl/MySQLDumpRowInputFormat.h | 2 +- .../Formats/Impl/RegexpRowInputFormat.cpp | 2 +- .../Formats/Impl/RegexpRowInputFormat.h | 2 +- .../Impl/TabSeparatedRowInputFormat.cpp | 16 +++-- .../Formats/Impl/TabSeparatedRowInputFormat.h | 9 ++- .../Formats/Impl/TemplateRowInputFormat.cpp | 2 +- .../Formats/Impl/TemplateRowInputFormat.h | 2 +- .../Formats/Impl/ValuesBlockInputFormat.cpp | 2 +- .../Formats/Impl/ValuesBlockInputFormat.h | 2 +- .../RowInputFormatWithNamesAndTypes.cpp | 60 ++++++++++--------- .../Formats/RowInputFormatWithNamesAndTypes.h | 10 ++-- ..._with_variable_number_of_columns.reference | 52 ++++++++++++++++ ...ormats_with_variable_number_of_columns.sql | 18 ++++++ 29 files changed, 264 insertions(+), 85 deletions(-) create mode 100644 tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference create mode 100644 tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql diff --git 
a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 15f9d1f47bf..95483068cb2 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -195,6 +195,7 @@ SELECT * FROM nestedt FORMAT TSV - [input_format_tsv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_first_lines) - skip specified number of lines at the beginning of data. Default value - `0`. - [input_format_tsv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_detect_header) - automatically detect header with names and types in TSV format. Default value - `true`. - [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. +- [input_format_tsv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_allow_variable_number_of_columns) - allow variable number of columns in TSV format, ignore extra columns and use default values on missing columns. Default value - `false`. ## TabSeparatedRaw {#tabseparatedraw} @@ -472,7 +473,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. - [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. - [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`. 
-- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Default value - `false`. +- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - allow variable number of columns in CSV format, ignore extra columns and use default values on missing columns. Default value - `false`. - [input_format_csv_use_default_on_bad_values](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_default_on_bad_values) - Allow to set default value to column when CSV field deserialization failed on bad value. Default value - `false`. ## CSVWithNames {#csvwithnames} @@ -501,9 +502,10 @@ the types from input data will be compared with the types of the corresponding c Similar to [Template](#format-template), but it prints or reads all names and types of columns and uses escaping rule from [format_custom_escaping_rule](/docs/en/operations/settings/settings-formats.md/#format_custom_escaping_rule) setting and delimiters from [format_custom_field_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_field_delimiter), [format_custom_row_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_before_delimiter), [format_custom_row_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_after_delimiter), [format_custom_row_between_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_between_delimiter), [format_custom_result_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_before_delimiter) and 
[format_custom_result_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_after_delimiter) settings, not from format strings. -If setting [input_format_custom_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) is enabled, ClickHouse will automatically detect header with names and types if any. - -If setting [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) is enabled, trailing empty lines at the end of file will be skipped. +Additional settings: +- [input_format_custom_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) - enables automatic detection of header with names and types if any. Default value - `true`. +- [input_format_custom_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_custom_skip_trailing_empty_lines) - skip trailing empty lines at the end of file. Default value - `false`. +- [input_format_custom_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_custom_allow_variable_number_of_columns) - allow variable number of columns in CustomSeparated format, ignore extra columns and use default values on missing columns. Default value - `false`. There is also `CustomSeparatedIgnoreSpaces` format, which is similar to [TemplateIgnoreSpaces](#templateignorespaces). @@ -1261,6 +1263,7 @@ SELECT * FROM json_each_row_nested - [input_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings-formats.md/#input_format_json_named_tuples_as_objects) - parse named tuple columns as JSON objects. Default value - `true`. 
- [input_format_json_defaults_for_missing_elements_in_named_tuple](/docs/en/operations/settings/settings-formats.md/#input_format_json_defaults_for_missing_elements_in_named_tuple) - insert default values for missing elements in JSON object while parsing named tuple. Default value - `true`. - [input_format_json_ignore_unknown_keys_in_named_tuple](/docs/en/operations/settings/settings-formats.md/#input_format_json_ignore_unknown_keys_in_named_tuple) - Ignore unknown keys in json object for named tuples. Default value - `false`. +- [input_format_json_compact_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_json_compact_allow_variable_number_of_columns) - allow variable number of columns in JSONCompact/JSONCompactEachRow format, ignore extra columns and use default values on missing columns. Default value - `false`. - [output_format_json_quote_64bit_integers](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`. - [output_format_json_quote_64bit_floats](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`. - [output_format_json_quote_denormals](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`. diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index ee8e0d547b8..8e3d6b74ffa 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -623,6 +623,13 @@ Column type should be String. If value is empty, default names `row_{i}`will be Default value: ''. 
+### input_format_json_compact_allow_variable_number_of_columns {#input_format_json_compact_allow_variable_number_of_columns} + +Allow variable number of columns in rows in JSONCompact/JSONCompactEachRow input formats. +Ignore extra columns in rows with more columns than expected and treat missing columns as default values. + +Disabled by default. + ## TSV format settings {#tsv-format-settings} ### input_format_tsv_empty_as_default {#input_format_tsv_empty_as_default} @@ -760,6 +767,13 @@ When enabled, trailing empty lines at the end of TSV file will be skipped. Disabled by default. +### input_format_tsv_allow_variable_number_of_columns {#input_format_tsv_allow_variable_number_of_columns} + +Allow variable number of columns in rows in TSV input format. +Ignore extra columns in rows with more columns than expected and treat missing columns as default values. + +Disabled by default. + ## CSV format settings {#csv-format-settings} ### format_csv_delimiter {#format_csv_delimiter} @@ -951,9 +965,11 @@ Result ```text " string " ``` + ### input_format_csv_allow_variable_number_of_columns {#input_format_csv_allow_variable_number_of_columns} -ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. +Allow variable number of columns in rows in CSV input format. +Ignore extra columns in rows with more columns than expected and treat missing columns as default values. Disabled by default. @@ -1589,6 +1605,13 @@ When enabled, trailing empty lines at the end of file in CustomSeparated format Disabled by default. +### input_format_custom_allow_variable_number_of_columns {#input_format_custom_allow_variable_number_of_columns} + +Allow variable number of columns in rows in CustomSeparated input format. +Ignore extra columns in rows with more columns than expected and treat missing columns as default values. + +Disabled by default. 
+ ## Regexp format settings {#regexp-format-settings} ### format_regexp_escaping_rule {#format_regexp_escaping_rule} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index c69d132ea25..86146bfad07 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -879,6 +879,10 @@ class IColumn; M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \ M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \ M(Bool, input_format_csv_use_default_on_bad_values, false, "Allow to set default value to column when CSV field deserialization failed on bad value", 0) \ + M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \ + M(Bool, input_format_tsv_allow_variable_number_of_columns, false, "Ignore extra columns in TSV input (if file has more columns than expected) and treat missing fields in TSV input as default values", 0) \ + M(Bool, input_format_custom_allow_variable_number_of_columns, false, "Ignore extra columns in CustomSeparated input (if file has more columns than expected) and treat missing fields in CustomSeparated input as default values", 0) \ + M(Bool, input_format_json_compact_allow_variable_number_of_columns, false, "Ignore extra columns in JSONCompact(EachRow) input (if file has more columns than expected) and treat missing fields in JSONCompact(EachRow) input as default values", 0) \ M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \ M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \ M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with 
unsupported types while schema inference for format Parquet", 0) \ @@ -1023,7 +1027,6 @@ class IColumn; M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \ \ M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \ - M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \ // End of FORMAT_FACTORY_SETTINGS // Please add settings non-related to formats into the COMMON_SETTINGS above. diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 663b7f1ba95..dff480d1f79 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -86,6 +86,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter; format_settings.custom.try_detect_header = settings.input_format_custom_detect_header; format_settings.custom.skip_trailing_empty_lines = settings.input_format_custom_skip_trailing_empty_lines; + format_settings.custom.allow_variable_number_of_columns = settings.input_format_custom_allow_variable_number_of_columns; format_settings.date_time_input_format = settings.date_time_input_format; format_settings.date_time_output_format = settings.date_time_output_format; format_settings.interval.output_format = settings.interval_output_format; @@ -115,6 +116,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.json.validate_utf8 = settings.output_format_json_validate_utf8; format_settings.json_object_each_row.column_for_object_name = settings.format_json_object_each_row_column_for_object_name; format_settings.json.allow_object_type = 
context->getSettingsRef().allow_experimental_object_type; + format_settings.json.compact_allow_variable_number_of_columns = settings.input_format_json_compact_allow_variable_number_of_columns; format_settings.null_as_default = settings.input_format_null_as_default; format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros; format_settings.parquet.row_group_rows = settings.output_format_parquet_row_group_size; @@ -161,6 +163,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines; format_settings.tsv.try_detect_header = settings.input_format_tsv_detect_header; format_settings.tsv.skip_trailing_empty_lines = settings.input_format_tsv_skip_trailing_empty_lines; + format_settings.tsv.allow_variable_number_of_columns = settings.input_format_tsv_allow_variable_number_of_columns; format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals; format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 3259c46e5ff..68cf9ad817d 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -176,6 +176,7 @@ struct FormatSettings EscapingRule escaping_rule = EscapingRule::Escaped; bool try_detect_header = true; bool skip_trailing_empty_lines = false; + bool allow_variable_number_of_columns = false; } custom; struct @@ -198,6 +199,7 @@ struct FormatSettings bool validate_types_from_metadata = true; bool validate_utf8 = false; bool allow_object_type = false; + bool compact_allow_variable_number_of_columns = false; } json; struct @@ -316,6 +318,7 @@ struct FormatSettings UInt64 skip_first_lines = 0; bool try_detect_header = true; bool 
skip_trailing_empty_lines = false; + bool allow_variable_number_of_columns = false; } tsv; struct diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 1fa520eaaee..15b53c2a499 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -115,21 +115,24 @@ NamesAndTypesList IRowSchemaReader::readSchema() "Cannot read rows to determine the schema, the maximum number of rows (or bytes) to read is set to 0. " "Most likely setting input_format_max_rows_to_read_for_schema_inference or input_format_max_bytes_to_read_for_schema_inference is set to 0"); - DataTypes data_types = readRowAndGetDataTypes(); + auto data_types_maybe = readRowAndGetDataTypes(); /// Check that we read at list one column. - if (data_types.empty()) + if (!data_types_maybe) throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Cannot read rows from the data"); + DataTypes data_types = std::move(*data_types_maybe); + /// If column names weren't set, use default names 'c1', 'c2', ... - if (column_names.empty()) + bool use_default_column_names = column_names.empty(); + if (use_default_column_names) { column_names.reserve(data_types.size()); for (size_t i = 0; i != data_types.size(); ++i) column_names.push_back("c" + std::to_string(i + 1)); } /// If column names were set, check that the number of names match the number of types. 
- else if (column_names.size() != data_types.size()) + else if (column_names.size() != data_types.size() && !allowVariableNumberOfColumns()) { throw Exception( ErrorCodes::INCORRECT_DATA, @@ -137,6 +140,9 @@ NamesAndTypesList IRowSchemaReader::readSchema() } else { + if (column_names.size() != data_types.size()) + data_types.resize(column_names.size()); + std::unordered_set names_set; for (const auto & name : column_names) { @@ -155,13 +161,39 @@ NamesAndTypesList IRowSchemaReader::readSchema() for (rows_read = 1; rows_read < max_rows_to_read && in.count() < max_bytes_to_read; ++rows_read) { - DataTypes new_data_types = readRowAndGetDataTypes(); - if (new_data_types.empty()) + auto new_data_types_maybe = readRowAndGetDataTypes(); + if (!new_data_types_maybe) /// We reached eof. break; + DataTypes new_data_types = std::move(*new_data_types_maybe); + if (new_data_types.size() != data_types.size()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Rows have different amount of values"); + { + if (!allowVariableNumberOfColumns()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Rows have different amount of values"); + + if (use_default_column_names) + { + /// Current row contains new columns, add new default names. + if (new_data_types.size() > data_types.size()) + { + for (size_t i = data_types.size(); i < new_data_types.size(); ++i) + column_names.push_back("c" + std::to_string(i + 1)); + data_types.resize(new_data_types.size()); + } + /// Current row contains fewer columns than previous rows. + else + { + new_data_types.resize(data_types.size()); + } + } + /// If names were explicitly set, ignore all extra columns. 
+ else + { + new_data_types.resize(column_names.size()); + } + } for (field_index = 0; field_index != data_types.size(); ++field_index) { diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 40702198a57..0cc8b98f05e 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -93,11 +93,13 @@ protected: /// Read one row and determine types of columns in it. /// Return types in the same order in which the values were in the row. /// If it's impossible to determine the type for some column, return nullptr for it. - /// Return empty list if can't read more data. - virtual DataTypes readRowAndGetDataTypes() = 0; + /// Return std::nullopt if can't read more data. + virtual std::optional readRowAndGetDataTypes() = 0; void setColumnNames(const std::vector & names) { column_names = names; } + virtual bool allowVariableNumberOfColumns() const { return false; } + size_t field_index; private: diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 244b906549e..9092c7fceba 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -284,7 +284,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) return true; } -bool CSVFormatReader::allowVariableNumberOfColumns() +bool CSVFormatReader::allowVariableNumberOfColumns() const { return format_settings.csv.allow_variable_number_of_columns; } @@ -410,19 +410,22 @@ CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_t { } -std::pair, DataTypes> CSVSchemaReader::readRowAndGetFieldsAndDataTypes() +std::optional, DataTypes>> CSVSchemaReader::readRowAndGetFieldsAndDataTypes() { if (buf.eof()) return {}; auto fields = reader.readRow(); auto data_types = tryInferDataTypesByEscapingRule(fields, format_settings, FormatSettings::EscapingRule::CSV); - return {fields, data_types}; + 
return std::make_pair(fields, data_types); } -DataTypes CSVSchemaReader::readRowAndGetDataTypesImpl() +std::optional CSVSchemaReader::readRowAndGetDataTypesImpl() { - return std::move(readRowAndGetFieldsAndDataTypes().second); + auto fields_with_types = readRowAndGetFieldsAndDataTypes(); + if (!fields_with_types) + return {}; + return std::move(fields_with_types->second); } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index 7b1a1fc433d..2444477b184 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -70,7 +70,7 @@ public: void skipPrefixBeforeHeader() override; bool checkForEndOfRow() override; - bool allowVariableNumberOfColumns() override; + bool allowVariableNumberOfColumns() const override; std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } @@ -102,8 +102,10 @@ public: CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_); private: - DataTypes readRowAndGetDataTypesImpl() override; - std::pair, DataTypes> readRowAndGetFieldsAndDataTypes() override; + bool allowVariableNumberOfColumns() const override { return format_settings.csv.allow_variable_number_of_columns; } + + std::optional readRowAndGetDataTypesImpl() override; + std::optional, DataTypes>> readRowAndGetFieldsAndDataTypes() override; PeekableReadBuffer buf; CSVFormatReader reader; diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index 1e67db79a2c..8f8e12e3c2a 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -142,7 +142,7 @@ void CustomSeparatedFormatReader::skipField() skipFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); } -bool 
CustomSeparatedFormatReader::checkEndOfRow() +bool CustomSeparatedFormatReader::checkForEndOfRow() { PeekableReadBufferCheckpoint checkpoint{*buf, true}; @@ -200,12 +200,12 @@ std::vector CustomSeparatedFormatReader::readRowImpl() std::vector values; skipRowStartDelimiter(); - if (columns == 0) + if (columns == 0 || allowVariableNumberOfColumns()) { do { values.push_back(readFieldIntoString(values.empty(), false, true)); - } while (!checkEndOfRow()); + } while (!checkForEndOfRow()); columns = values.size(); } else @@ -230,7 +230,7 @@ void CustomSeparatedFormatReader::skipHeaderRow() skipField(); } - while (!checkEndOfRow()); + while (!checkForEndOfRow()); skipRowEndDelimiter(); } @@ -369,7 +369,7 @@ CustomSeparatedSchemaReader::CustomSeparatedSchemaReader( { } -std::pair, DataTypes> CustomSeparatedSchemaReader::readRowAndGetFieldsAndDataTypes() +std::optional, DataTypes>> CustomSeparatedSchemaReader::readRowAndGetFieldsAndDataTypes() { if (no_more_data || reader.checkForSuffix()) { @@ -385,12 +385,15 @@ std::pair, DataTypes> CustomSeparatedSchemaReader::readRowAn auto fields = reader.readRow(); auto data_types = tryInferDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), &json_inference_info); - return {fields, data_types}; + return std::make_pair(fields, data_types); } -DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypesImpl() +std::optional CustomSeparatedSchemaReader::readRowAndGetDataTypesImpl() { - return readRowAndGetFieldsAndDataTypes().second; + auto fields_with_types = readRowAndGetFieldsAndDataTypes(); + if (!fields_with_types) + return {}; + return std::move(fields_with_types->second); } void CustomSeparatedSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h index 2acf35bd143..893f06409f6 100644 --- 
a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h @@ -74,7 +74,9 @@ public: std::vector readRowForHeaderDetection() override { return readRowImpl(); } - bool checkEndOfRow(); + bool checkForEndOfRow() override; + bool allowVariableNumberOfColumns() const override { return format_settings.custom.allow_variable_number_of_columns; } + bool checkForSuffixImpl(bool check_eof); inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf, true); } @@ -109,9 +111,11 @@ public: CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_); private: - DataTypes readRowAndGetDataTypesImpl() override; + bool allowVariableNumberOfColumns() const override { return format_settings.custom.allow_variable_number_of_columns; } - std::pair, DataTypes> readRowAndGetFieldsAndDataTypes() override; + std::optional readRowAndGetDataTypesImpl() override; + + std::optional, DataTypes>> readRowAndGetFieldsAndDataTypes() override; void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override; diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index b91345bebe3..e3583a3dff0 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -112,6 +112,12 @@ bool JSONCompactEachRowFormatReader::readField(IColumn & column, const DataTypeP return JSONUtils::readField(*in, column, type, serialization, column_name, format_settings, yield_strings); } +bool JSONCompactEachRowFormatReader::checkForEndOfRow() +{ + skipWhitespaceIfAny(*in); + return !in->eof() && *in->position() == ']'; +} + bool JSONCompactEachRowFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out) { skipWhitespaceIfAny(*in); @@ -187,7 +193,7 @@ 
JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader( { } -DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypesImpl() +std::optional JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypesImpl() { if (first_row) first_row = false; diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h index bb699f0ca2e..378a41e6471 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h @@ -68,6 +68,9 @@ public: std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } + bool checkForEndOfRow() override; + bool allowVariableNumberOfColumns() const override { return format_settings.json.compact_allow_variable_number_of_columns; } + bool yieldStrings() const { return yield_strings; } private: bool yield_strings; @@ -79,7 +82,9 @@ public: JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_); private: - DataTypes readRowAndGetDataTypesImpl() override; + bool allowVariableNumberOfColumns() const override { return format_settings.json.compact_allow_variable_number_of_columns; } + + std::optional readRowAndGetDataTypesImpl() override; void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override; void transformFinalTypeIfNeeded(DataTypePtr & type) override; diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index eeca14176cc..a46f0018611 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -634,7 +634,7 @@ DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object) UNREACHABLE(); } -DataTypes MsgPackSchemaReader::readRowAndGetDataTypes() 
+std::optional MsgPackSchemaReader::readRowAndGetDataTypes() { if (buf.eof()) return {}; diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h index 0b485d3b97c..028ab878ad0 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h @@ -91,7 +91,7 @@ public: private: msgpack::object_handle readObject(); DataTypePtr getDataType(const msgpack::object & object); - DataTypes readRowAndGetDataTypes() override; + std::optional readRowAndGetDataTypes() override; PeekableReadBuffer buf; UInt64 number_of_columns; diff --git a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp index 90dd07bd5a8..6c754f141da 100644 --- a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp @@ -422,7 +422,7 @@ NamesAndTypesList MySQLDumpSchemaReader::readSchema() return IRowSchemaReader::readSchema(); } -DataTypes MySQLDumpSchemaReader::readRowAndGetDataTypes() +std::optional MySQLDumpSchemaReader::readRowAndGetDataTypes() { if (in.eof()) return {}; diff --git a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.h b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.h index c28355054d7..14a73bf83b0 100644 --- a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.h +++ b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.h @@ -33,7 +33,7 @@ public: private: NamesAndTypesList readSchema() override; - DataTypes readRowAndGetDataTypes() override; + std::optional readRowAndGetDataTypes() override; String table_name; }; diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index d902a8be6a7..8e94a568b1e 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -143,7 +143,7 @@ 
RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & { } -DataTypes RegexpSchemaReader::readRowAndGetDataTypes() +std::optional RegexpSchemaReader::readRowAndGetDataTypes() { if (buf.eof()) return {}; diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.h b/src/Processors/Formats/Impl/RegexpRowInputFormat.h index 2469774aaf9..7417d48d8c1 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.h +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.h @@ -79,7 +79,7 @@ public: RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings); private: - DataTypes readRowAndGetDataTypes() override; + std::optional readRowAndGetDataTypes() override; void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 2239c8539e3..7fbad583ced 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -300,6 +300,11 @@ bool TabSeparatedFormatReader::checkForSuffix() return false; } +bool TabSeparatedFormatReader::checkForEndOfRow() +{ + return buf->eof() || *buf->position() == '\n'; +} + TabSeparatedSchemaReader::TabSeparatedSchemaReader( ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesSchemaReader( @@ -315,19 +320,22 @@ TabSeparatedSchemaReader::TabSeparatedSchemaReader( { } -std::pair, DataTypes> TabSeparatedSchemaReader::readRowAndGetFieldsAndDataTypes() +std::optional, DataTypes>> TabSeparatedSchemaReader::readRowAndGetFieldsAndDataTypes() { if (buf.eof()) return {}; auto fields = reader.readRow(); auto data_types = tryInferDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule()); - return {fields, data_types}; + return std::make_pair(fields, data_types); } -DataTypes 
TabSeparatedSchemaReader::readRowAndGetDataTypesImpl() +std::optional TabSeparatedSchemaReader::readRowAndGetDataTypesImpl() { - return readRowAndGetFieldsAndDataTypes().second; + auto fields_with_types = readRowAndGetFieldsAndDataTypes(); + if (!fields_with_types) + return {}; + return std::move(fields_with_types->second); } void registerInputFormatTabSeparated(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 8df57675cf5..e0234761d61 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -76,6 +76,9 @@ public: void setReadBuffer(ReadBuffer & in_) override; bool checkForSuffix() override; + bool checkForEndOfRow() override; + + bool allowVariableNumberOfColumns() const override { return format_settings.tsv.allow_variable_number_of_columns; } private: template @@ -92,8 +95,10 @@ public: TabSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings); private: - DataTypes readRowAndGetDataTypesImpl() override; - std::pair, DataTypes> readRowAndGetFieldsAndDataTypes() override; + bool allowVariableNumberOfColumns() const override { return format_settings.tsv.allow_variable_number_of_columns; } + + std::optional readRowAndGetDataTypesImpl() override; + std::optional, DataTypes>> readRowAndGetFieldsAndDataTypes() override; PeekableReadBuffer buf; TabSeparatedFormatReader reader; diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 8a09e800fa7..b065e00f5d1 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -490,7 +490,7 @@ TemplateSchemaReader::TemplateSchemaReader( setColumnNames(row_format.column_names); } -DataTypes 
TemplateSchemaReader::readRowAndGetDataTypes() +std::optional TemplateSchemaReader::readRowAndGetDataTypes() { if (first_row) format_reader.readPrefix(); diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 8f9088e2c47..2752cb13e50 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -119,7 +119,7 @@ public: std::string row_between_delimiter, const FormatSettings & format_settings_); - DataTypes readRowAndGetDataTypes() override; + std::optional readRowAndGetDataTypes() override; private: void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override; diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index 3a65a6fe4ea..6cb469afca1 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -638,7 +638,7 @@ ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & { } -DataTypes ValuesSchemaReader::readRowAndGetDataTypes() +std::optional ValuesSchemaReader::readRowAndGetDataTypes() { if (first_row) { diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h index 8f8d44ec088..7f1dbc0da66 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h @@ -105,7 +105,7 @@ public: ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings); private: - DataTypes readRowAndGetDataTypes() override; + std::optional readRowAndGetDataTypes() override; PeekableReadBuffer buf; ParserExpression parser; diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index fb49779e0af..cb5c11e2d3b 100644 --- 
a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -212,8 +212,23 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE format_reader->skipRowStartDelimiter(); ext.read_columns.resize(data_types.size()); - for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) + size_t file_column = 0; + for (; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) { + if (format_reader->allowVariableNumberOfColumns() && format_reader->checkForEndOfRow()) + { + while (file_column < column_mapping->column_indexes_for_input_fields.size()) + { + const auto & rem_column_index = column_mapping->column_indexes_for_input_fields[file_column]; + columns[*rem_column_index]->insertDefault(); + ++file_column; + } + break; + } + + if (file_column != 0) + format_reader->skipFieldDelimiter(); + const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column]; const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); if (column_index) @@ -225,22 +240,6 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE column_mapping->names_of_columns[file_column]); else format_reader->skipField(file_column); - - if (!is_last_file_column) - { - if (format_reader->allowVariableNumberOfColumns() && format_reader->checkForEndOfRow()) - { - ++file_column; - while (file_column < column_mapping->column_indexes_for_input_fields.size()) - { - const auto & rem_column_index = column_mapping->column_indexes_for_input_fields[file_column]; - columns[*rem_column_index]->insertDefault(); - ++file_column; - } - } - else - format_reader->skipFieldDelimiter(); - } } if (format_reader->allowVariableNumberOfColumns() && !format_reader->checkForEndOfRow()) @@ -248,7 +247,7 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns 
& columns, RowReadE do { format_reader->skipFieldDelimiter(); - format_reader->skipField(1); + format_reader->skipField(file_column++); } while (!format_reader->checkForEndOfRow()); } @@ -419,12 +418,14 @@ namespace void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & column_names, std::vector & type_names) { - auto [first_row_values, first_row_types] = readRowAndGetFieldsAndDataTypes(); + auto first_row = readRowAndGetFieldsAndDataTypes(); /// No data. - if (first_row_values.empty()) + if (!first_row) return; + auto [first_row_values, first_row_types] = *first_row; + /// The first row contains non String elements, it cannot be a header. if (!checkIfAllTypesAreString(first_row_types)) { @@ -432,15 +433,17 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & return; } - auto [second_row_values, second_row_types] = readRowAndGetFieldsAndDataTypes(); + auto second_row = readRowAndGetFieldsAndDataTypes(); /// Data contains only 1 row, don't treat it as a header. - if (second_row_values.empty()) + if (!second_row) { buffered_types = first_row_types; return; } + auto [second_row_values, second_row_types] = *second_row; + DataTypes data_types; bool second_row_can_be_type_names = checkIfAllTypesAreString(second_row_types) && checkIfAllValuesAreTypeNames(readNamesFromFields(second_row_values)); size_t row = 2; @@ -450,15 +453,16 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & } else { - data_types = readRowAndGetDataTypes(); + auto data_types_maybe = readRowAndGetDataTypes(); /// Data contains only 2 rows. 
- if (data_types.empty()) + if (!data_types_maybe) { second_row_can_be_type_names = false; data_types = second_row_types; } else { + data_types = *data_types_maybe; ++row; } } @@ -490,10 +494,10 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & return; } - auto next_row_types = readRowAndGetDataTypes(); + auto next_row_types_maybe = readRowAndGetDataTypes(); /// Check if there are no more rows in data. It means that all rows contains only String values and Nulls, /// so, the first two rows with all String elements can be real data and we cannot use them as a header. - if (next_row_types.empty()) + if (!next_row_types_maybe) { /// Buffer first data types from the first row, because it doesn't contain Nulls. buffered_types = first_row_types; @@ -502,11 +506,11 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & ++row; /// Combine types from current row and from previous rows. - chooseResultColumnTypes(*this, data_types, next_row_types, getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV), default_colum_names, row); + chooseResultColumnTypes(*this, data_types, *next_row_types_maybe, getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV), default_colum_names, row); } } -DataTypes FormatWithNamesAndTypesSchemaReader::readRowAndGetDataTypes() +std::optional FormatWithNamesAndTypesSchemaReader::readRowAndGetDataTypes() { /// Check if we tried to detect a header and have buffered types from read rows. 
if (!buffered_types.empty()) diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index b5103d3db39..7b3e2cbea67 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -121,7 +121,7 @@ public: virtual bool checkForEndOfRow() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method checkForEndOfRow is not implemented"); } - virtual bool allowVariableNumberOfColumns() { return false; } + virtual bool allowVariableNumberOfColumns() const { return false; } const FormatSettings & getFormatSettings() const { return format_settings; } @@ -160,15 +160,15 @@ public: NamesAndTypesList readSchema() override; protected: - virtual DataTypes readRowAndGetDataTypes() override; + virtual std::optional readRowAndGetDataTypes() override; - virtual DataTypes readRowAndGetDataTypesImpl() + virtual std::optional readRowAndGetDataTypesImpl() { throw Exception{ErrorCodes::NOT_IMPLEMENTED, "Method readRowAndGetDataTypesImpl is not implemented"}; } - /// Return column fields with inferred types. In case of no more rows, return empty vectors. - virtual std::pair, DataTypes> readRowAndGetFieldsAndDataTypes() + /// Return column fields with inferred types. In case of no more rows, return nullopt. 
+ virtual std::optional, DataTypes>> readRowAndGetFieldsAndDataTypes() { throw Exception{ErrorCodes::NOT_IMPLEMENTED, "Method readRowAndGetFieldsAndDataTypes is not implemented"}; } diff --git a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference new file mode 100644 index 00000000000..39d24f2cbd2 --- /dev/null +++ b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference @@ -0,0 +1,52 @@ +CSV +1 1 +2 0 +0 0 +3 3 +1 1 \N \N +2 \N \N \N +\N \N \N \N +3 3 3 3 +1 1 +2 \N +\N \N +3 3 +TSV +1 1 +2 0 +0 0 +3 3 +1 1 \N \N +2 \N \N \N +\N \N \N \N +3 3 3 3 +1 1 +2 \N +\N \N +3 3 +JSONCompactEachRow +1 1 +2 0 +0 0 +3 3 +1 1 \N \N +2 \N \N \N +\N \N \N \N +3 3 3 3 +1 1 +2 \N +\N \N +3 3 +CustomSeparated +1 1 +2 0 +0 0 +3 3 +1 1 \N \N +2 \N \N \N +\N \N \N \N +3 3 3 3 +1 1 +2 \N +\N \N +3 3 diff --git a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql new file mode 100644 index 00000000000..c0a80bf2114 --- /dev/null +++ b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql @@ -0,0 +1,18 @@ +select 'CSV'; +select * from format(CSV, 'x UInt32, y UInt32', '1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; +select * from format(CSV, '1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; +select * from format(CSVWithNames, '"x","y"\n1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; +select 'TSV'; +select * from format(TSV, 'x UInt32, y UInt32', '1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; +select * from format(TSV, '1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; +select * from format(TSVWithNames, 'x\ty\n1\t1\n2\n\n3\t3\t3\t3') settings 
input_format_tsv_allow_variable_number_of_columns=1; +select 'JSONCompactEachRow'; +select * from format(JSONCompactEachRow, 'x UInt32, y UInt32', '[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; +select * from format(JSONCompactEachRow, '[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; +select * from format(JSONCompactEachRowWithNames, '["x","y"]\n[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; +select 'CustomSeparated'; +set format_custom_escaping_rule='CSV', format_custom_field_delimiter='', format_custom_row_before_delimiter='', format_custom_row_after_delimiter='', format_custom_row_between_delimiter='', format_custom_result_before_delimiter='', format_custom_result_after_delimiter=''; +select * from format(CustomSeparated, 'x UInt32, y UInt32', '1123333') settings input_format_custom_allow_variable_number_of_columns=1; +select * from format(CustomSeparated, '1123333') settings input_format_custom_allow_variable_number_of_columns=1; +select * from format(CustomSeparatedWithNames, '"x""y"1123333') settings input_format_custom_allow_variable_number_of_columns=1; + From c3c64a7dd50ee0f25dd94eb1d1b645e0352471ec Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 28 Jul 2023 11:40:05 +0000 Subject: [PATCH 2/4] Fix --- .../Impl/CustomSeparatedRowInputFormat.cpp | 5 ++++- .../Formats/RowInputFormatWithNamesAndTypes.cpp | 3 ++- ...ats_with_variable_number_of_columns.reference | 16 ++++++++++++++++ ...4_formats_with_variable_number_of_columns.sql | 4 ++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index 8f8e12e3c2a..ff3d6d49199 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -139,7 
+139,10 @@ void CustomSeparatedFormatReader::skipRowBetweenDelimiter() void CustomSeparatedFormatReader::skipField() { skipSpaces(); - skipFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); + if (format_settings.custom.escaping_rule == FormatSettings::EscapingRule::CSV) + readCSVFieldWithTwoPossibleDelimiters(*buf, format_settings.csv, format_settings.custom.field_delimiter, format_settings.custom.row_after_delimiter); + else + skipFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); } bool CustomSeparatedFormatReader::checkForEndOfRow() diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index cb5c11e2d3b..4000bd14ddc 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -220,7 +220,8 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE while (file_column < column_mapping->column_indexes_for_input_fields.size()) { const auto & rem_column_index = column_mapping->column_indexes_for_input_fields[file_column]; - columns[*rem_column_index]->insertDefault(); + if (rem_column_index) + columns[*rem_column_index]->insertDefault(); ++file_column; } break; diff --git a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference index 39d24f2cbd2..e9ff548e05c 100644 --- a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference +++ b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference @@ -11,6 +11,10 @@ CSV 2 \N \N \N 3 3 +1 0 +2 0 +0 0 +3 0 TSV 1 1 2 0 @@ -24,6 +28,10 @@ TSV 2 \N \N \N 3 3 +1 0 +2 0 +0 0 +3 0 JSONCompactEachRow 1 1 2 0 @@ -37,6 +45,10 @@ JSONCompactEachRow 2 \N \N \N 3 3 +1 0 +2 0 +0 0 +3 0 CustomSeparated 1 1 2 0 @@ -50,3 +62,7 @@ CustomSeparated 
2 \N \N \N 3 3 +1 0 +2 0 +0 0 +3 0 diff --git a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql index c0a80bf2114..dea4c20db8a 100644 --- a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql +++ b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql @@ -2,17 +2,21 @@ select 'CSV'; select * from format(CSV, 'x UInt32, y UInt32', '1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; select * from format(CSV, '1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; select * from format(CSVWithNames, '"x","y"\n1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; +select * from format(CSVWithNames, 'x UInt32, z UInt32', '"x","y"\n1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; select 'TSV'; select * from format(TSV, 'x UInt32, y UInt32', '1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; select * from format(TSV, '1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; select * from format(TSVWithNames, 'x\ty\n1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; +select * from format(TSVWithNames, 'x UInt32, z UInt32', 'x\ty\n1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; select 'JSONCompactEachRow'; select * from format(JSONCompactEachRow, 'x UInt32, y UInt32', '[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; select * from format(JSONCompactEachRow, '[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; select * from format(JSONCompactEachRowWithNames, '["x","y"]\n[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; +select * 
from format(JSONCompactEachRowWithNames, 'x UInt32, z UInt32', '["x","y"]\n[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; select 'CustomSeparated'; set format_custom_escaping_rule='CSV', format_custom_field_delimiter='', format_custom_row_before_delimiter='', format_custom_row_after_delimiter='', format_custom_row_between_delimiter='', format_custom_result_before_delimiter='', format_custom_result_after_delimiter=''; select * from format(CustomSeparated, 'x UInt32, y UInt32', '1123333') settings input_format_custom_allow_variable_number_of_columns=1; select * from format(CustomSeparated, '1123333') settings input_format_custom_allow_variable_number_of_columns=1; select * from format(CustomSeparatedWithNames, '"x""y"1123333') settings input_format_custom_allow_variable_number_of_columns=1; +select * from format(CustomSeparatedWithNames, 'x UInt32, z UInt32', '"x""y"1123333') settings input_format_custom_allow_variable_number_of_columns=1; From bb38918a263dd59307c463bf038ebf0c4d28d184 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 10 Aug 2023 13:21:11 +0200 Subject: [PATCH 3/4] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: János Benjamin Antal --- src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 2 +- src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp | 2 +- src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 9092c7fceba..52f9571f962 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -417,7 +417,7 @@ std::optional, DataTypes>> CSVSchemaReader::readRo auto fields = reader.readRow(); auto data_types = 
tryInferDataTypesByEscapingRule(fields, format_settings, FormatSettings::EscapingRule::CSV); - return std::make_pair(fields, data_types); + return std::make_pair(std::move(fields), std::move(data_types)); } std::optional CSVSchemaReader::readRowAndGetDataTypesImpl() diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index ff3d6d49199..17cc88425f5 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -388,7 +388,7 @@ std::optional, DataTypes>> CustomSeparatedSchemaRe auto fields = reader.readRow(); auto data_types = tryInferDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), &json_inference_info); - return std::make_pair(fields, data_types); + return std::make_pair(std::move(fields), std::move(data_types)); } std::optional CustomSeparatedSchemaReader::readRowAndGetDataTypesImpl() diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index 4000bd14ddc..fc2b5cd8207 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -425,7 +425,7 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & if (!first_row) return; - auto [first_row_values, first_row_types] = *first_row; + const auto & [first_row_values, first_row_types] = *first_row; /// The first row contains non String elements, it cannot be a header. 
if (!checkIfAllTypesAreString(first_row_types)) @@ -443,7 +443,7 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & return; } - auto [second_row_values, second_row_types] = *second_row; + const auto & [second_row_values, second_row_types] = *second_row; DataTypes data_types; bool second_row_can_be_type_names = checkIfAllTypesAreString(second_row_types) && checkIfAllValuesAreTypeNames(readNamesFromFields(second_row_values)); From 82aff97dd04605233371c9c6de1e59933961cb78 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 10 Aug 2023 11:51:36 +0000 Subject: [PATCH 4/4] Add comment, more test --- src/Processors/Formats/RowInputFormatWithNamesAndTypes.h | 1 + ...2834_formats_with_variable_number_of_columns.reference | 8 ++++++++ .../02834_formats_with_variable_number_of_columns.sql | 2 ++ 3 files changed, 11 insertions(+) diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index 7b3e2cbea67..377341da685 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -119,6 +119,7 @@ public: /// Check suffix. virtual bool checkForSuffix() { return in->eof(); } + /// Check if we are at the end of row, not between fields. 
virtual bool checkForEndOfRow() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method checkForEndOfRow is not implemented"); } virtual bool allowVariableNumberOfColumns() const { return false; } diff --git a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference index e9ff548e05c..50173c150c0 100644 --- a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference +++ b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference @@ -37,6 +37,14 @@ JSONCompactEachRow 2 0 0 0 3 3 +1 1 +2 0 +0 0 +3 3 +1 [1,2,3] +2 [] +0 [] +3 [3] 1 1 \N \N 2 \N \N \N \N \N \N \N diff --git a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql index dea4c20db8a..7c55cf2e9a7 100644 --- a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql +++ b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql @@ -10,6 +10,8 @@ select * from format(TSVWithNames, 'x\ty\n1\t1\n2\n\n3\t3\t3\t3') settings input select * from format(TSVWithNames, 'x UInt32, z UInt32', 'x\ty\n1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; select 'JSONCompactEachRow'; select * from format(JSONCompactEachRow, 'x UInt32, y UInt32', '[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; +select * from format(JSONCompactEachRow, 'x UInt32, y UInt32', '[1,1,[1,2,3]]\n[2]\n[]\n[3,3,3,3,[1,2,3]]') settings input_format_json_compact_allow_variable_number_of_columns=1; +select * from format(JSONCompactEachRow, 'x UInt32, y Array(UInt32)', '[1,[1,2,3],1]\n[2]\n[]\n[3,[3],3,3,[1,2,3]]') settings input_format_json_compact_allow_variable_number_of_columns=1; select * from format(JSONCompactEachRow, '[1,1]\n[2]\n[]\n[3,3,3,3]') settings 
input_format_json_compact_allow_variable_number_of_columns=1; select * from format(JSONCompactEachRowWithNames, '["x","y"]\n[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; select * from format(JSONCompactEachRowWithNames, 'x UInt32, z UInt32', '["x","y"]\n[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1;