mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
Merge pull request #62086 from KevinyhZou/improve_hive_text_read_by_replace_settings
Improve Hive Text reading by allowing a variable number of fields
This commit is contained in:
commit
7a3bfb31e8
@ -1012,6 +1012,7 @@ class IColumn;
|
||||
M(Char, input_format_hive_text_fields_delimiter, '\x01', "Delimiter between fields in Hive Text File", 0) \
|
||||
M(Char, input_format_hive_text_collection_items_delimiter, '\x02', "Delimiter between collection(array or map) items in Hive Text File", 0) \
|
||||
M(Char, input_format_hive_text_map_keys_delimiter, '\x03', "Delimiter between a pair of map key/values in Hive Text File", 0) \
|
||||
M(Bool, input_format_hive_text_allow_variable_number_of_columns, true, "Ignore extra columns in Hive Text input (if file has more columns than expected) and treat missing fields in Hive Text input as default values", 0) \
|
||||
M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \
|
||||
M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \
|
||||
M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \
|
||||
|
@ -129,6 +129,7 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
|
||||
{"azure_max_upload_part_size", 5ull*1024*1024*1024, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to Azure blob storage."},
|
||||
{"azure_upload_part_size_multiply_factor", 2, 2, "Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage."},
|
||||
{"azure_upload_part_size_multiply_parts_count_threshold", 500, 500, "Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor."},
|
||||
{"input_format_hive_text_allow_variable_number_of_columns", false, true, "Ignore extra columns in Hive Text input (if file has more columns than expected) and treat missing fields in Hive Text input as default values."},
|
||||
}},
|
||||
{"24.2", {{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"},
|
||||
{"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"},
|
||||
|
@ -96,6 +96,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
|
||||
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
|
||||
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
|
||||
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
|
||||
format_settings.hive_text.allow_variable_number_of_columns = settings.input_format_hive_text_allow_variable_number_of_columns;
|
||||
format_settings.custom.escaping_rule = settings.format_custom_escaping_rule;
|
||||
format_settings.custom.field_delimiter = settings.format_custom_field_delimiter;
|
||||
format_settings.custom.result_after_delimiter = settings.format_custom_result_after_delimiter;
|
||||
|
@ -176,6 +176,7 @@ struct FormatSettings
|
||||
char fields_delimiter = '\x01';
|
||||
char collection_items_delimiter = '\x02';
|
||||
char map_keys_delimiter = '\x03';
|
||||
bool allow_variable_number_of_columns = true;
|
||||
Names input_field_names;
|
||||
} hive_text{};
|
||||
|
||||
|
@ -19,6 +19,7 @@ static FormatSettings updateFormatSettings(const FormatSettings & settings, cons
|
||||
updated.date_time_input_format = FormatSettings::DateTimeInputFormat::BestEffort;
|
||||
updated.defaults_for_omitted_fields = true;
|
||||
updated.csv.delimiter = updated.hive_text.fields_delimiter;
|
||||
updated.csv.allow_variable_number_of_columns = settings.hive_text.allow_variable_number_of_columns;
|
||||
if (settings.hive_text.input_field_names.empty())
|
||||
updated.hive_text.input_field_names = header.getNames();
|
||||
return updated;
|
||||
@ -44,9 +45,6 @@ HiveTextFormatReader::HiveTextFormatReader(PeekableReadBuffer & buf_, const Form
|
||||
|
||||
/// Returns the column names held by this reader (`input_field_names`).
/// As rendered here, the body first reads one header row under a buffer checkpoint and
/// resizes `input_field_names` to that row's field count before returning it.
/// NOTE(review): the hunk header for this function reads "-44,9 +45,6", i.e. the commit
/// removes three lines here, but this rendering carries no +/- markers — confirm against
/// the repository which of the statements below actually remain.
std::vector<String> HiveTextFormatReader::readNames()
|
||||
{
|
||||
// Checkpoint on the peekable buffer; the `true` flag presumably requests an automatic
// rollback when `checkpoint` goes out of scope — TODO confirm against PeekableReadBuffer.
PeekableReadBufferCheckpoint checkpoint{*buf, true};
|
||||
// Only the number of fields in this row is used below, not the field contents.
auto values = readHeaderRow();
|
||||
// Resize (rather than overwrite) the stored names so their count matches the fields read.
input_field_names.resize(values.size());
|
||||
return input_field_names;
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,2 @@
|
||||
1 3 0
|
||||
3 5 9
|
13
tests/queries/0_stateless/03033_hive_text_read_variable_fields.sh
Executable file
13
tests/queries/0_stateless/03033_hive_text_read_variable_fields.sh
Executable file
@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-fasttest
|
||||
# NOTE: this sh wrapper is required because of shell_config
# Loads data_hive/fields_number_variable.txt (comma-delimited) through the HIVETEXT
# input format into a three-column table and prints the table contents.
|
||||
|
||||
# Absolute directory of this script; used to locate shell_config.sh and the data file.
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
# Remove any table left behind by an earlier run before creating a fresh one.
$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl"
|
||||
# Three columns: the table the data file is inserted into.
$CLICKHOUSE_CLIENT -q "create table test_tbl (a UInt16, b UInt32, c UInt32) engine=MergeTree order by a"
|
||||
# Insert the data file, overriding the Hive Text field delimiter with ','.
$CLICKHOUSE_CLIENT -q "insert into test_tbl from infile '$CURDIR/data_hive/fields_number_variable.txt' SETTINGS input_format_hive_text_fields_delimiter=',' FORMAT HIVETEXT"
|
||||
# Print the loaded rows; this is the script's visible output.
$CLICKHOUSE_CLIENT -q "select * from test_tbl"
|
||||
# Clean up the table created above.
$CLICKHOUSE_CLIENT -q "drop table test_tbl"
|
@ -0,0 +1,2 @@
|
||||
1,3
|
||||
3,5,9
|
Loading…
Reference in New Issue
Block a user