Allow to skip some of the first rows in CSV/TSV formats

2024-11-23 16:12:01 +00:00 · 2022-05-25 15:00:11 +00:00 · 2022-05-25 15:00:11 +00:00 · 4c9812d4c1
commit 4c9812d4c1
parent b50d4549c9
9 changed files with 48 additions and 0 deletions
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -673,6 +673,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
    M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \
    M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \
+    M(UInt64, input_format_csv_skip_first_lines, 0, "Skip specified amount of lines in the beginning of data in CSV format", 0) \
+    M(UInt64, input_format_tsv_skip_first_lines, 0, "Skip specified amount of lines in the beginning of data in TSV format", 0) \
    \
    M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic', 'best_effort' and 'best_effort_us'.", 0) \
    M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \
--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@ -66,6 +66,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
    format_settings.csv.null_representation = settings.format_csv_null_representation;
    format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
    format_settings.csv.input_format_use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference;
+    format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines;
    format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
    format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
    format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
@ -123,6 +124,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
    format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number;
    format_settings.tsv.null_representation = settings.format_tsv_null_representation;
    format_settings.tsv.input_format_use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference;
+    format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines;
    format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
    format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
    format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@ -109,6 +109,7 @@ struct FormatSettings
        String null_representation = "\\N";
        char tuple_delimiter = ',';
        bool input_format_use_best_effort_in_schema_inference = true;
+        UInt64 skip_first_lines = 0;
    } csv;

    struct HiveText
@ -219,6 +220,7 @@ struct FormatSettings
        String null_representation = "\\N";
        bool input_format_enum_as_number = false;
        bool input_format_use_best_effort_in_schema_inference = true;
+        UInt64 skip_first_lines = 0;
    } tsv;

    struct
--- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
@ -259,6 +259,12 @@ bool CSVFormatReader::readField(
    }
 }

+void CSVFormatReader::skipPrefixBeforeHeader()
+{
+    for (auto lines_left = format_settings.csv.skip_first_lines; lines_left != 0; --lines_left)
+        readRow(); /// The row is read only to get past it; the result is not used.
+}
+

 CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_)
    : FormatWithNamesAndTypesSchemaReader(
--- a/src/Processors/Formats/Impl/CSVRowInputFormat.h
+++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h
@ -58,6 +58,7 @@ public:
    void skipTypes() override { skipHeaderRow(); }
    void skipFieldDelimiter() override;
    void skipRowEndDelimiter() override;
+    void skipPrefixBeforeHeader() override;

    std::vector<String> readNames() override { return readHeaderRow(); }
    std::vector<String> readTypes() override { return readHeaderRow(); }
--- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp
@ -230,6 +230,12 @@ void TabSeparatedFormatReader::checkNullValueForNonNullable(DataTypePtr type)
    }
 }

+void TabSeparatedFormatReader::skipPrefixBeforeHeader()
+{
+    for (size_t i = 0; i != format_settings.tsv.skip_first_lines; ++i)
+        readRow(); /// The row is read only to get past it; the returned fields are not used.
+}
+
 void TabSeparatedRowInputFormat::syncAfterError()
 {
    skipToUnescapedNextLineOrEOF(*in);
--- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h
+++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h
@ -43,6 +43,7 @@ public:
    void skipTypes() override { skipHeaderRow(); }
    void skipFieldDelimiter() override;
    void skipRowEndDelimiter() override;
+    void skipPrefixBeforeHeader() override;

    std::vector<String> readRow();
    std::vector<String> readNames() override { return readRow(); }
--- a/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.reference
+++ b/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.reference
@ -0,0 +1,16 @@
+c1	Nullable(Float64)					
+c2	Nullable(Float64)					
+c3	Nullable(Float64)					
+0	1	2
+1	2	3
+2	3	4
+3	4	5
+4	5	6
+c1	Nullable(Float64)					
+c2	Nullable(Float64)					
+c3	Nullable(Float64)					
+0	1	2
+1	2	3
+2	3	4
+3	4	5
+4	5	6
--- a/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.sql
+++ b/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.sql
@ -0,0 +1,12 @@
+-- Tags: no-parallel
+
+insert into function file(data_02314.csv) select number, number + 1 from numbers(5) settings engine_file_truncate_on_insert=1;
+insert into function file(data_02314.csv) select number, number + 1, number + 2 from numbers(5);
+desc file(data_02314.csv) settings input_format_csv_skip_first_lines=5;
+select * from file(data_02314.csv) settings input_format_csv_skip_first_lines=5;
+-- Same scenario for TSV; it must be driven by input_format_tsv_skip_first_lines, not the CSV setting.
+insert into function file(data_02314.tsv) select number, number + 1 from numbers(5) settings engine_file_truncate_on_insert=1;
+insert into function file(data_02314.tsv) select number, number + 1, number + 2 from numbers(5);
+desc file(data_02314.tsv) settings input_format_tsv_skip_first_lines=5;
+select * from file(data_02314.tsv) settings input_format_tsv_skip_first_lines=5;
+