Allow skipping a specified number of leading rows in CSV/TSV formats

This commit is contained in:
avogar 2022-05-25 15:00:11 +00:00
parent b50d4549c9
commit 4c9812d4c1
9 changed files with 48 additions and 0 deletions

View File

@ -673,6 +673,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \
M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \
M(UInt64, input_format_csv_skip_first_lines, 0, "Skip specified amount of lines in the beginning of data in CSV format", 0) \
M(UInt64, input_format_tsv_skip_first_lines, 0, "Skip specified amount of lines in the beginning of data in TSV format", 0) \
\
M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic', 'best_effort' and 'best_effort_us'.", 0) \
M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \

View File

@ -66,6 +66,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.null_representation = settings.format_csv_null_representation;
format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
format_settings.csv.input_format_use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference;
format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines;
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
@ -123,6 +124,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number;
format_settings.tsv.null_representation = settings.format_tsv_null_representation;
format_settings.tsv.input_format_use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference;
format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines;
format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;

View File

@ -109,6 +109,7 @@ struct FormatSettings
String null_representation = "\\N";
char tuple_delimiter = ',';
bool input_format_use_best_effort_in_schema_inference = true;
UInt64 skip_first_lines = 0;
} csv;
struct HiveText
@ -219,6 +220,7 @@ struct FormatSettings
String null_representation = "\\N";
bool input_format_enum_as_number = false;
bool input_format_use_best_effort_in_schema_inference = true;
UInt64 skip_first_lines = 0;
} tsv;
struct

View File

@ -259,6 +259,12 @@ bool CSVFormatReader::readField(
}
}
void CSVFormatReader::skipPrefixBeforeHeader()
{
for (size_t i = 0; i != format_settings.csv.skip_first_lines; ++i)
readRow();
}
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_)
: FormatWithNamesAndTypesSchemaReader(

View File

@ -58,6 +58,7 @@ public:
/// The types row is skipped with the same routine as any other header row.
void skipTypes() override { skipHeaderRow(); }
void skipFieldDelimiter() override;
void skipRowEndDelimiter() override;
/// Reads and discards `format_settings.csv.skip_first_lines` rows of the input.
void skipPrefixBeforeHeader() override;
/// Names and types rows are both read through readHeaderRow().
std::vector<String> readNames() override { return readHeaderRow(); }
std::vector<String> readTypes() override { return readHeaderRow(); }

View File

@ -230,6 +230,12 @@ void TabSeparatedFormatReader::checkNullValueForNonNullable(DataTypePtr type)
}
}
/// Consumes and discards the first `format_settings.tsv.skip_first_lines` rows of the input
/// (the value comes from the `input_format_tsv_skip_first_lines` setting).
/// Every discarded row is read with readRow(); the parsed fields are thrown away.
void TabSeparatedFormatReader::skipPrefixBeforeHeader()
{
    /// Must use the TSV counter here: `csv.skip_first_lines` is configured independently
    /// and belongs to the CSV reader.
    for (size_t i = 0; i != format_settings.tsv.skip_first_lines; ++i)
        readRow();
}
void TabSeparatedRowInputFormat::syncAfterError()
{
skipToUnescapedNextLineOrEOF(*in);

View File

@ -43,6 +43,7 @@ public:
/// The types row is skipped with the same routine as any other header row.
void skipTypes() override { skipHeaderRow(); }
void skipFieldDelimiter() override;
void skipRowEndDelimiter() override;
/// Discards the configured number of leading rows (`skip_first_lines` in the format settings)
/// by reading them with readRow() and ignoring the result.
void skipPrefixBeforeHeader() override;
std::vector<String> readRow();
/// The names row is read like any other row.
std::vector<String> readNames() override { return readRow(); }

View File

@ -0,0 +1,16 @@
c1 Nullable(Float64)
c2 Nullable(Float64)
c3 Nullable(Float64)
0 1 2
1 2 3
2 3 4
3 4 5
4 5 6
c1 Nullable(Float64)
c2 Nullable(Float64)
c3 Nullable(Float64)
0 1 2
1 2 3
2 3 4
3 4 5
4 5 6

View File

@ -0,0 +1,12 @@
-- Tags: no-parallel
-- For each format: the first insert truncates the file and writes 5 two-column rows, the second
-- insert then adds 5 three-column rows. With the first 5 lines skipped, both schema inference
-- (desc) and reading (select) should see only the three-column rows.

-- CSV: controlled by input_format_csv_skip_first_lines.
insert into function file(data_02314.csv) select number, number + 1 from numbers(5) settings engine_file_truncate_on_insert=1;
insert into function file(data_02314.csv) select number, number + 1, number + 2 from numbers(5);
desc file(data_02314.csv) settings input_format_csv_skip_first_lines=5;
select * from file(data_02314.csv) settings input_format_csv_skip_first_lines=5;

-- TSV: must be controlled by its own setting, input_format_tsv_skip_first_lines, not by the CSV one.
insert into function file(data_02314.tsv) select number, number + 1 from numbers(5) settings engine_file_truncate_on_insert=1;
insert into function file(data_02314.tsv) select number, number + 1, number + 2 from numbers(5);
desc file(data_02314.tsv) settings input_format_tsv_skip_first_lines=5;
select * from file(data_02314.tsv) settings input_format_tsv_skip_first_lines=5;