Fixed bug: unexpected result when using IN clause for filtering partitions

lgbo-ustc 2022-02-28 16:47:50 +08:00 committed by liangjiabiao
parent db69ab9d17
commit 5885cfd869
4 changed files with 17 additions and 4 deletions
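For context, the unexpected results showed up on queries that filter a Hive-backed table with an IN predicate over its partition columns. Below is a minimal reproduction sketch written in the style of the integration test added in this commit; the h0_0_0 instance and the default.demo_parquet_parts table are the ones that test defines, everything else is illustrative.

# Hypothetical reproduction, in the style of the integration tests in this commit.
# Assumes a started cluster with the 'h0_0_0' instance and the
# default.demo_parquet_parts Hive-engine table created further down.
def reproduce_in_filter_issue(started_cluster):
    node = started_cluster.instances['h0_0_0']
    # Before the fix, filtering partition columns with IN could produce an
    # unexpected result; semantically, the two queries below must always agree.
    with_in = node.query(
        "SELECT count(*) FROM default.demo_parquet_parts "
        "WHERE day = '2021-11-05' AND hour IN ('00')")
    with_eq = node.query(
        "SELECT count(*) FROM default.demo_parquet_parts "
        "WHERE day = '2021-11-05' AND hour = '00'")
    assert with_in == with_eq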

View File

@@ -2,5 +2,8 @@
hive -e "create database test"
hive -e "create table test.demo(id string, score int) PARTITIONED BY(day string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'; create table test.demo_orc(id string, score int) PARTITIONED BY(day string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'; "
hive -e "create table test.parquet_demo(id string, score int) PARTITIONED BY(day string, hour) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'"
hive -e "create table test.demo_text(id string, score int, day string)row format delimited fields terminated by ','; load data local inpath '/demo_data.txt' into table test.demo_text "
hive -e "set hive.exec.dynamic.partition.mode=nonstrict;insert into test.demo partition(day) select * from test.demo_text; insert into test.demo_orc partition(day) select * from test.demo_text"
hive -e "set hive.exec.dynamic.partition.mode=nonstrict;insert into test.demo partition(day) select * from test.demo_text; insert into test.demo_orc partition(day) select * from test.demo_text"
hive -e "set hive.exec.dynamic.partition.mode=nonstrict;insert into test.parquet_demo partition(day, hour) select id, score, day, '00' as hour from test.demo;"
hive -e "set hive.exec.dynamic.partition.mode=nonstrict;insert into test.parquet_demo partition(day, hour) select id, score, day, '01' as hour from test.demo;"

View File

@@ -1,7 +1,7 @@
version: '2.3'
services:
hdfs1:
image: lgboustc/hive_test:v1.0
image: lgboustc/hive_test:v2.0
hostname: hivetest
restart: always
entrypoint: bash /start.sh
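The image bump to lgboustc/hive_test:v2.0 presumably ships the prepared parquet_demo data used by the new test. The tests below reach the metastore at thrift://hivetest:9083; here is a hedged, standard-library sketch (not part of this commit, helper name made up) for checking that the endpoint is reachable before issuing queries:

import socket
import time

# Hypothetical readiness probe: poll until a TCP connection to the Hive
# metastore (host/port taken from the ENGINE definitions below) succeeds.
def wait_for_metastore(host="hivetest", port=9083, timeout=120):
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=5):
                return True
        except OSError:
            time.sleep(2)
    return False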

View File

@@ -36,7 +36,7 @@ public:
        ContextPtr /* query_context */,
        const StorageMetadataPtr & /* metadata_snapshot */) const override
    {
        return false;
        return true;
    }
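Judging by the query-context and metadata-snapshot parameters, this appears to be the Hive storage's mayBenefitFromIndexForIn override; flipping it to return true tells the analyzer that IN predicates are worth preparing for this storage, which is what lets them take part in partition pruning. A hedged consistency check in the style of the tests below (instance and table names are the ones the tests define; not part of this commit):

# Hedged illustration: a multi-value IN over the partition key should behave
# like the union of the corresponding equality filters.
def check_multi_value_in(started_cluster):
    node = started_cluster.instances['h0_0_0']
    both = node.query(
        "SELECT count(*) FROM default.demo_parquet_parts "
        "WHERE day = '2021-11-05' AND hour IN ('00', '01')")
    h00 = node.query(
        "SELECT count(*) FROM default.demo_parquet_parts "
        "WHERE day = '2021-11-05' AND hour = '00'")
    h01 = node.query(
        "SELECT count(*) FROM default.demo_parquet_parts "
        "WHERE day = '2021-11-05' AND hour = '01'")
    assert int(both) == int(h00) + int(h01)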

View File

@@ -30,7 +30,8 @@ def test_create_parquet_table(started_cluster):
    node.query("set input_format_parquet_allow_missing_columns = true")
    result = node.query("""
DROP TABLE IF EXISTS default.demo_parquet;
CREATE TABLE default.demo_parquet (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo') PARTITION BY(day)
CREATE TABLE default.demo_parquet (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo') PARTITION BY(day);
CREATE TABLE default.demo_parquet_parts (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String), `hour` String) ENGINE = Hive('thrift://hivetest:9083', 'test', 'parquet_demo') PARTITION BY(day, hour);
    """)
    logging.info("create result {}".format(result))
    time.sleep(120)
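The fixed time.sleep(120) above gives the Hive side time to become queryable. A hedged alternative sketch that polls the newly created table instead of sleeping blindly (not part of this commit; node and table names reused from the test above):

import time

# Hypothetical helper: issue a cheap query until it succeeds or the timeout hits.
def wait_until_queryable(node, table="default.demo_parquet_parts", timeout=120):
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            node.query(f"SELECT count(*) FROM {table}")
            return True
        except Exception:
            time.sleep(5)
    return False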
@@ -70,6 +71,15 @@ def test_parquet_groupby(started_cluster):
2021-11-16 2
"""
    assert result == expected_result

def test_parquet_in_filter(started_cluster):
    logging.info('Start testing IN filter ...')
    node = started_cluster.instances['h0_0_0']
    result = node.query("""
SELECT day, count(*) FROM default.demo_parquet_parts where day = '2021-11-05' and hour in ('00')
    """)
    expected_result = """2021-11-05 2
"""
    assert result == expected_result

def test_orc_groupby(started_cluster):
    logging.info('Start testing groupby ...')
    node = started_cluster.instances['h0_0_0']