Merge pull request #62086 from KevinyhZou/improve_hive_text_read_by_replace_settings

Improve hive text read by allow variable number of fields
This commit is contained in:
Kruglov Pavel 2024-04-10 12:49:59 +00:00 committed by GitHub
commit 7a3bfb31e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 22 additions and 3 deletions

View File

@ -1012,6 +1012,7 @@ class IColumn;
M(Char, input_format_hive_text_fields_delimiter, '\x01', "Delimiter between fields in Hive Text File", 0) \
M(Char, input_format_hive_text_collection_items_delimiter, '\x02', "Delimiter between collection(array or map) items in Hive Text File", 0) \
M(Char, input_format_hive_text_map_keys_delimiter, '\x03', "Delimiter between a pair of map key/values in Hive Text File", 0) \
M(Bool, input_format_hive_text_allow_variable_number_of_columns, true, "Ignore extra columns in Hive Text input (if file has more columns than expected) and treat missing fields in Hive Text input as default values", 0) \
M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \
M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \
M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \

View File

@ -129,6 +129,7 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
{"azure_max_upload_part_size", 5ull*1024*1024*1024, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to Azure blob storage."},
{"azure_upload_part_size_multiply_factor", 2, 2, "Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage."},
{"azure_upload_part_size_multiply_parts_count_threshold", 500, 500, "Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor."},
{"input_format_hive_text_allow_variable_number_of_columns", false, true, "Ignore extra columns in Hive Text input (if file has more columns than expected) and treat missing fields in Hive Text input as default values."},
}},
{"24.2", {{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"},
{"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"},

View File

@ -96,6 +96,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
format_settings.hive_text.allow_variable_number_of_columns = settings.input_format_hive_text_allow_variable_number_of_columns;
format_settings.custom.escaping_rule = settings.format_custom_escaping_rule;
format_settings.custom.field_delimiter = settings.format_custom_field_delimiter;
format_settings.custom.result_after_delimiter = settings.format_custom_result_after_delimiter;

View File

@ -176,6 +176,7 @@ struct FormatSettings
char fields_delimiter = '\x01';
char collection_items_delimiter = '\x02';
char map_keys_delimiter = '\x03';
bool allow_variable_number_of_columns = true;
Names input_field_names;
} hive_text{};

View File

@ -19,6 +19,7 @@ static FormatSettings updateFormatSettings(const FormatSettings & settings, cons
updated.date_time_input_format = FormatSettings::DateTimeInputFormat::BestEffort;
updated.defaults_for_omitted_fields = true;
updated.csv.delimiter = updated.hive_text.fields_delimiter;
updated.csv.allow_variable_number_of_columns = settings.hive_text.allow_variable_number_of_columns;
if (settings.hive_text.input_field_names.empty())
updated.hive_text.input_field_names = header.getNames();
return updated;
@ -44,9 +45,6 @@ HiveTextFormatReader::HiveTextFormatReader(PeekableReadBuffer & buf_, const Form
std::vector<String> HiveTextFormatReader::readNames()
{
PeekableReadBufferCheckpoint checkpoint{*buf, true};
auto values = readHeaderRow();
input_field_names.resize(values.size());
return input_field_names;
}

View File

@ -0,0 +1,2 @@
1 3 0
3 5 9

View File

@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Tags: no-fasttest
# NOTE: this sh wrapper is required because of shell_config
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl"
$CLICKHOUSE_CLIENT -q "create table test_tbl (a UInt16, b UInt32, c UInt32) engine=MergeTree order by a"
$CLICKHOUSE_CLIENT -q "insert into test_tbl from infile '$CURDIR/data_hive/fields_number_variable.txt' SETTINGS input_format_hive_text_fields_delimiter=',' FORMAT HIVETEXT"
$CLICKHOUSE_CLIENT -q "select * from test_tbl"
$CLICKHOUSE_CLIENT -q "drop table test_tbl"

View File

@ -0,0 +1,2 @@
1,3
3,5,9