add tests for hive null_as_default

2024-12-04 13:32:13 +00:00 · 2023-05-04 14:40:22 +08:00 · 2023-05-04 14:40:22 +08:00 · 5c67c5fd26
commit 5c67c5fd26
parent 9a67616791
3 changed files with 33 additions and 2 deletions
--- a/src/Processors/Formats/Impl/HiveTextRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/HiveTextRowInputFormat.cpp
@ -17,6 +17,7 @@ static FormatSettings updateFormatSettings(const FormatSettings & settings, cons
    updated.skip_unknown_fields = true;
    updated.with_names_use_header = true;
    updated.date_time_input_format = FormatSettings::DateTimeInputFormat::BestEffort;
+    updated.defaults_for_omitted_fields = true;
    updated.csv.delimiter = updated.hive_text.fields_delimiter;
    if (settings.hive_text.input_field_names.empty())
        updated.hive_text.input_field_names = header.getNames();
--- a/tests/integration/test_hive_query/data/prepare_hive_data.sh
+++ b/tests/integration/test_hive_query/data/prepare_hive_data.sh
@ -5,11 +5,13 @@ hive -e "drop table if exists test.demo; create table test.demo(id string, score
 hive -e "drop table if exists test.parquet_demo; create table test.parquet_demo(id string, score int) PARTITIONED BY(day string, hour string) ROW FORMAT SERDE   'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'  STORED AS INPUTFORMAT  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'"
 hive -e "drop table if exists test.demo_text; create table test.demo_text(id string, score int, day string)row format delimited fields terminated by ','; load data local inpath '/demo_data.txt' into table test.demo_text "
 hive -e "set hive.exec.dynamic.partition.mode=nonstrict;insert into test.demo partition(day) select * from test.demo_text; insert into test.demo_orc partition(day) select * from test.demo_text"
-
 hive -e "set hive.exec.dynamic.partition.mode=nonstrict;insert into test.parquet_demo partition(day, hour) select id, score, day, '00' as hour from test.demo;"
 hive -e "set hive.exec.dynamic.partition.mode=nonstrict;insert into test.parquet_demo partition(day, hour) select id, score, day, '01' as hour from test.demo;"

 hive -e "drop table if exists test.test_hive_types; CREATE TABLE test.test_hive_types( f_tinyint tinyint, f_smallint smallint, f_int int, f_integer int, f_bigint bigint, f_float float, f_double double, f_decimal decimal(10,0), f_timestamp timestamp, f_date date, f_string string, f_varchar varchar(100), f_char char(100), f_bool boolean, f_array_int array<int>, f_array_string array<string>, f_array_float array<float>, f_map_int map<string, int>, f_map_string map<string, string>, f_map_float map<string, float>, f_struct struct<a:string, b:int, c:float, d: struct<x:int, y:string>>) PARTITIONED BY( day string) ROW FORMAT SERDE   'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'  STORED AS INPUTFORMAT  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat';"
-
 hive -e "insert into test.test_hive_types partition(day='2022-02-20') select 1, 2, 3, 4, 5, 6.11, 7.22, 8.333, '2022-02-20 14:47:04', '2022-02-20', 'hello world', 'hello world', 'hello world', true, array(1,2,3), array('hello world', 'hello world'), array(float(1.1),float(1.2)), map('a', 100, 'b', 200, 'c', 300), map('a', 'aa', 'b', 'bb', 'c', 'cc'), map('a', float(111.1), 'b', float(222.2), 'c', float(333.3)), named_struct('a', 'aaa', 'b', 200, 'c', float(333.3), 'd', named_struct('x', 10, 'y', 'xyz')); insert into test.test_hive_types partition(day='2022-02-19') select 1, 2, 3, 4, 5, 6.11, 7.22, 8.333, '2022-02-19 14:47:04', '2022-02-19', 'hello world', 'hello world', 'hello world', true, array(1,2,3), array('hello world', 'hello world'), array(float(1.1),float(1.2)), map('a', 100, 'b', 200, 'c', 300), map('a', 'aa', 'b', 'bb', 'c', 'cc'), map('a', float(111.1), 'b', float(222.2), 'c', float(333.3)), named_struct('a', 'aaa', 'b', 200, 'c', float(333.3), 'd', named_struct('x', 11, 'y', 'abc'));"

+hive -e "drop table if exists test.null_as_default_orc; create table test.null_as_default_orc (x string, y string) PARTITIONED BY(day string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'"
+hive -e "insert into test.null_as_default_orc partition(day='2023-05-01') select null, null;"
+hive -e "drop table if exists test.null_as_default_parquet; create table test.null_as_default_parquet (x string, y string) PARTITIONED BY(day string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'"
+hive -e "set hive.exec.dynamic.partition.mode=nonstrict; insert into test.null_as_default_parquet partition(day) select x, y, day from test.null_as_default_orc;"
--- a/tests/integration/test_hive_query/test.py
+++ b/tests/integration/test_hive_query/test.py
@ -496,3 +496,31 @@ CREATE TABLE IF NOT EXISTS default.demo_parquet_1 (`id` Nullable(String), `score
    result = node.query("""DESC default.demo_parquet_1 FORMAT TSV""")
    expected_result = """id\tNullable(String)\t\t\tText comment\t\t\nscore\tNullable(Int32)\t\t\t\t\t\nday\tNullable(String)"""
    assert result.strip() == expected_result
+
+
+@pytest.mark.parametrize(
+    "table",
+    [
+        pytest.param("null_as_default_orc", id="test_null_as_default_orc"),
+        pytest.param("null_as_default_parquet", id="test_null_as_default_parquet"),
+    ],
+)
+def test_null_as_default(started_cluster, table):
+    """Check that Hive NULLs are read as the ClickHouse column defaults.
+
+    The Hive tables hold a single row with x = NULL and y = NULL, so the
+    non-Nullable columns must come back as '' for x and 'world' for y.
+    """
+    node = started_cluster.instances["h0_0_0"]
+    # str.format() placeholders are plain `{table}`; a shell-style `${table}`
+    # would leave a stray `$` in front of every substituted table name.
+    # SET is part of the same multi-statement query so it covers the SELECT.
+    result = node.query(
+        """
+SET input_format_null_as_default = true;
+DROP TABLE IF EXISTS default.{table};
+CREATE TABLE default.{table} (`x` String, `y` String DEFAULT 'world', `day` String) ENGINE = Hive('thrift://hivetest:9083', 'test', '{table}') PARTITION BY(day);
+select x, y from default.{table};
+""".format(table=table)
+    )
+    assert result.strip("\n") == "\tworld"