Merge pull request #62086 from KevinyhZou/improve_hive_text_read_by_replace_settings

Improve Hive Text reading by allowing a variable number of fields
2024-09-20 08:40:50 +00:00 · 2024-04-10 12:49:59 +00:00 · 2024-04-10 12:49:59 +00:00 · 7a3bfb31e8
commit 7a3bfb31e8
parent a2154418ee 44b3ce9ec8
8 changed files with 22 additions and 3 deletions
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -1012,6 +1012,7 @@ class IColumn;
    M(Char, input_format_hive_text_fields_delimiter, '\x01', "Delimiter between fields in Hive Text File", 0) \
    M(Char, input_format_hive_text_collection_items_delimiter, '\x02', "Delimiter between collection(array or map) items in Hive Text File", 0) \
    M(Char, input_format_hive_text_map_keys_delimiter, '\x03', "Delimiter between a pair of map key/values in Hive Text File", 0) \
+    M(Bool, input_format_hive_text_allow_variable_number_of_columns, true, "Ignore extra columns in Hive Text input (if file has more columns than expected) and treat missing fields in Hive Text input as default values", 0) \
    M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \
    M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \
    M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \
--- a/src/Core/SettingsChangesHistory.h
+++ b/src/Core/SettingsChangesHistory.h
@ -129,6 +129,7 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
              {"azure_max_upload_part_size", 5ull*1024*1024*1024, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to Azure blob storage."},
              {"azure_upload_part_size_multiply_factor", 2, 2, "Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage."},
              {"azure_upload_part_size_multiply_parts_count_threshold", 500, 500, "Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor."},
+              {"input_format_hive_text_allow_variable_number_of_columns", false, true, "Ignore extra columns in Hive Text input (if file has more columns than expected) and treat missing fields in Hive Text input as default values."},
              }},
    {"24.2", {{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"},
              {"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"},
--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@ -96,6 +96,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
    format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
    format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
    format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
+    format_settings.hive_text.allow_variable_number_of_columns = settings.input_format_hive_text_allow_variable_number_of_columns;
    format_settings.custom.escaping_rule = settings.format_custom_escaping_rule;
    format_settings.custom.field_delimiter = settings.format_custom_field_delimiter;
    format_settings.custom.result_after_delimiter = settings.format_custom_result_after_delimiter;
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@ -176,6 +176,7 @@ struct FormatSettings
        char fields_delimiter = '\x01';
        char collection_items_delimiter = '\x02';
        char map_keys_delimiter = '\x03';
+        bool allow_variable_number_of_columns = true;
        Names input_field_names;
    } hive_text{};

--- a/src/Processors/Formats/Impl/HiveTextRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/HiveTextRowInputFormat.cpp
@ -19,6 +19,7 @@ static FormatSettings updateFormatSettings(const FormatSettings & settings, cons
    updated.date_time_input_format = FormatSettings::DateTimeInputFormat::BestEffort;
    updated.defaults_for_omitted_fields = true;
    updated.csv.delimiter = updated.hive_text.fields_delimiter;
+    updated.csv.allow_variable_number_of_columns = settings.hive_text.allow_variable_number_of_columns;
    if (settings.hive_text.input_field_names.empty())
        updated.hive_text.input_field_names = header.getNames();
    return updated;
@ -44,9 +45,6 @@ HiveTextFormatReader::HiveTextFormatReader(PeekableReadBuffer & buf_, const Form

 std::vector<String> HiveTextFormatReader::readNames()
 {
-    PeekableReadBufferCheckpoint checkpoint{*buf, true};
-    auto values = readHeaderRow();
-    input_field_names.resize(values.size());
    return input_field_names;
 }

--- a/tests/queries/0_stateless/03033_hive_text_read_variable_fields.reference
+++ b/tests/queries/0_stateless/03033_hive_text_read_variable_fields.reference
@ -0,0 +1,2 @@
+1	3	0
+3	5	9
--- a/tests/queries/0_stateless/03033_hive_text_read_variable_fields.sh
+++ b/tests/queries/0_stateless/03033_hive_text_read_variable_fields.sh
@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Tags: no-fasttest
+# NOTE: this sh wrapper is required because of shell_config
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl"
+$CLICKHOUSE_CLIENT -q "create table test_tbl (a UInt16, b UInt32, c UInt32) engine=MergeTree order by a"
+$CLICKHOUSE_CLIENT -q "insert into test_tbl from infile '$CURDIR/data_hive/fields_number_variable.txt' SETTINGS input_format_hive_text_fields_delimiter=',' FORMAT HIVETEXT"
+$CLICKHOUSE_CLIENT -q "select * from test_tbl"
+$CLICKHOUSE_CLIENT -q "drop table test_tbl"
--- a/tests/queries/0_stateless/data_hive/fields_number_variable.txt
+++ b/tests/queries/0_stateless/data_hive/fields_number_variable.txt
@ -0,0 +1,2 @@
+1,3
+3,5,9