From 4c9812d4c1941ee48e1a2fcd260764160afa714e Mon Sep 17 00:00:00 2001
From: avogar
Date: Wed, 25 May 2022 15:00:11 +0000
Subject: [PATCH] Allow to skip some of the first rows in CSV/TSV formats

Add two settings, input_format_csv_skip_first_lines and
input_format_tsv_skip_first_lines (both UInt64, default 0). They are copied
into FormatSettings::csv.skip_first_lines and
FormatSettings::tsv.skip_first_lines respectively. The CSV and TSV format
readers override skipPrefixBeforeHeader() to read and discard that many rows.

---
Review notes (ignored by git am):
 - TabSeparatedFormatReader::skipPrefixBeforeHeader() read
   format_settings.csv.skip_first_lines; it now reads the tsv field, and the
   .tsv queries in the test set input_format_tsv_skip_first_lines.
 - Blob ids on the index lines of TabSeparatedRowInputFormat.cpp and the .sql
   test predate that fix; regenerate the patch before a 3-way apply.
 - Line breaks and indentation were restored from a whitespace-collapsed copy
   (std::vector<String> on context lines is inferred; the .reference rows are
   presumably tab-separated) -- confirm against the tree.

 src/Core/Settings.h                             |  2 ++
 src/Formats/FormatFactory.cpp                   |  2 ++
 src/Formats/FormatSettings.h                    |  2 ++
 .../Formats/Impl/CSVRowInputFormat.cpp          |  6 ++++++
 src/Processors/Formats/Impl/CSVRowInputFormat.h |  1 +
 .../Formats/Impl/TabSeparatedRowInputFormat.cpp |  6 ++++++
 .../Formats/Impl/TabSeparatedRowInputFormat.h   |  1 +
 .../02314_csv_tsv_skip_first_lines.reference    | 16 ++++++++++++++++
 .../02314_csv_tsv_skip_first_lines.sql          | 12 ++++++++++++
 9 files changed, 48 insertions(+)
 create mode 100644 tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.reference
 create mode 100644 tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.sql

diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index bf9785fcc00..4d1952ec028 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -673,6 +673,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
     M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
     M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \
     M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \
+    M(UInt64, input_format_csv_skip_first_lines, 0, "Skip specified amount of lines in the beginning of data in CSV format", 0) \
+    M(UInt64, input_format_tsv_skip_first_lines, 0, "Skip specified amount of lines in the beginning of data in TSV format", 0) \
     \
     M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic', 'best_effort' and 'best_effort_us'.", 0) \
     M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \
diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp
index 644e4d3ecfd..b2bec88340c 100644
--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@@ -66,6 +66,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
     format_settings.csv.null_representation = settings.format_csv_null_representation;
     format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
     format_settings.csv.input_format_use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference;
+    format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines;
     format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
     format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
     format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
@@ -123,6 +124,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
     format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number;
     format_settings.tsv.null_representation = settings.format_tsv_null_representation;
     format_settings.tsv.input_format_use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference;
+    format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines;
     format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
     format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
     format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h
index e6f0a7d229e..eabfa2ad58b 100644
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@@ -109,6 +109,7 @@ struct FormatSettings
         String null_representation = "\\N";
         char tuple_delimiter = ',';
         bool input_format_use_best_effort_in_schema_inference = true;
+        UInt64 skip_first_lines = 0;
     } csv;
 
     struct HiveText
@@ -219,6 +220,7 @@ struct FormatSettings
         String null_representation = "\\N";
         bool input_format_enum_as_number = false;
         bool input_format_use_best_effort_in_schema_inference = true;
+        UInt64 skip_first_lines = 0;
     } tsv;
 
     struct
diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
index 0eaa02c97cb..bddd4203a5d 100644
--- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
@@ -259,6 +259,12 @@ bool CSVFormatReader::readField(
     }
 }
 
+void CSVFormatReader::skipPrefixBeforeHeader()
+{
+    for (size_t i = 0; i != format_settings.csv.skip_first_lines; ++i)
+        readRow();
+}
+
 
 CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_)
     : FormatWithNamesAndTypesSchemaReader(
diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h
index 91a872378c8..20a92c07830 100644
--- a/src/Processors/Formats/Impl/CSVRowInputFormat.h
+++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h
@@ -58,6 +58,7 @@ public:
     void skipTypes() override { skipHeaderRow(); }
     void skipFieldDelimiter() override;
     void skipRowEndDelimiter() override;
+    void skipPrefixBeforeHeader() override;
 
     std::vector<String> readNames() override { return readHeaderRow(); }
     std::vector<String> readTypes() override { return readHeaderRow(); }
diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp
index 0be8257f463..877ba224fd5 100644
--- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp
@@ -230,6 +230,12 @@ void TabSeparatedFormatReader::checkNullValueForNonNullable(DataTypePtr type)
     }
 }
 
+void TabSeparatedFormatReader::skipPrefixBeforeHeader()
+{
+    for (size_t i = 0; i != format_settings.tsv.skip_first_lines; ++i)
+        readRow();
+}
+
 void TabSeparatedRowInputFormat::syncAfterError()
 {
     skipToUnescapedNextLineOrEOF(*in);
diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h
index abab5b02c96..3476b974c3b 100644
--- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h
+++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h
@@ -43,6 +43,7 @@ public:
     void skipTypes() override { skipHeaderRow(); }
     void skipFieldDelimiter() override;
     void skipRowEndDelimiter() override;
+    void skipPrefixBeforeHeader() override;
 
     std::vector<String> readRow();
     std::vector<String> readNames() override { return readRow(); }
diff --git a/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.reference b/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.reference
new file mode 100644
index 00000000000..7d8e0c662cd
--- /dev/null
+++ b/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.reference
@@ -0,0 +1,16 @@
+c1 Nullable(Float64)
+c2 Nullable(Float64)
+c3 Nullable(Float64)
+0 1 2
+1 2 3
+2 3 4
+3 4 5
+4 5 6
+c1 Nullable(Float64)
+c2 Nullable(Float64)
+c3 Nullable(Float64)
+0 1 2
+1 2 3
+2 3 4
+3 4 5
+4 5 6
diff --git a/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.sql b/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.sql
new file mode 100644
index 00000000000..ff913a2a3ca
--- /dev/null
+++ b/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.sql
@@ -0,0 +1,12 @@
+-- Tags: no-parallel
+
+insert into function file(data_02314.csv) select number, number + 1 from numbers(5) settings engine_file_truncate_on_insert=1;
+insert into function file(data_02314.csv) select number, number + 1, number + 2 from numbers(5);
+desc file(data_02314.csv) settings input_format_csv_skip_first_lines=5;
+select * from file(data_02314.csv) settings input_format_csv_skip_first_lines=5;
+
+insert into function file(data_02314.tsv) select number, number + 1 from numbers(5) settings engine_file_truncate_on_insert=1;
+insert into function file(data_02314.tsv) select number, number + 1, number + 2 from numbers(5);
+desc file(data_02314.tsv) settings input_format_tsv_skip_first_lines=5;
+select * from file(data_02314.tsv) settings input_format_tsv_skip_first_lines=5;
+