import logging import os import time import pytest from helpers.cluster import ClickHouseCluster from helpers.test_tools import TSV logging.getLogger().setLevel(logging.INFO) logging.getLogger().addHandler(logging.StreamHandler()) SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) @pytest.fixture(scope="module") def started_cluster(): try: cluster = ClickHouseCluster(__file__) cluster.add_instance( "h0_0_0", main_configs=["configs/config.xml"], extra_configs=["configs/hdfs-site.xml"], with_hive=True, ) logging.info("Starting cluster ...") cluster.start() yield cluster finally: cluster.shutdown() def test_create_parquet_table(started_cluster): logging.info("Start testing creating hive table ...") node = started_cluster.instances["h0_0_0"] test_passed = False for i in range(10): node.query("set input_format_parquet_allow_missing_columns = true") result = node.query( """ DROP TABLE IF EXISTS default.demo_parquet; CREATE TABLE default.demo_parquet (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo') PARTITION BY(day) """ ) logging.info("create result {}".format(result)) if result.strip() == "": test_passed = True break time.sleep(60) assert test_passed def test_create_parquet_table_1(started_cluster): logging.info("Start testing creating hive table ...") node = started_cluster.instances["h0_0_0"] for i in range(10): node.query("set input_format_parquet_allow_missing_columns = true") result = node.query( """ DROP TABLE IF EXISTS default.demo_parquet_parts; CREATE TABLE default.demo_parquet_parts (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String), `hour` String) ENGINE = Hive('thrift://hivetest:9083', 'test', 'parquet_demo') PARTITION BY(day, hour); """ ) logging.info("create result {}".format(result)) if result.strip() == "": test_passed = True break time.sleep(60) assert test_passed def test_create_orc_table(started_cluster): logging.info("Start testing creating hive table ...") node = started_cluster.instances["h0_0_0"] test_passed = False for i in range(10): result = node.query( """ DROP TABLE IF EXISTS default.demo_orc; CREATE TABLE default.demo_orc (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo_orc') PARTITION BY(day) """ ) logging.info("create result {}".format(result)) if result.strip() == "": test_passed = True break time.sleep(60) assert test_passed def test_create_text_table(started_cluster): logging.info("Start testing creating hive table ...") node = started_cluster.instances["h0_0_0"] result = node.query( """ DROP TABLE IF EXISTS default.demo_text; CREATE TABLE default.demo_text (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo_text') PARTITION BY (tuple()) """ ) logging.info("create result {}".format(result)) assert result.strip() == "" def test_parquet_groupby(started_cluster): logging.info("Start testing groupby ...") node = started_cluster.instances["h0_0_0"] result = node.query( """ SELECT day, count(*) FROM default.demo_parquet group by day order by day """ ) expected_result = """2021-11-01 1 2021-11-05 2 2021-11-11 1 2021-11-16 2 """ assert result == expected_result def test_parquet_in_filter(started_cluster): logging.info("Start testing groupby ...") node = started_cluster.instances["h0_0_0"] result = node.query( """ SELECT count(*) FROM default.demo_parquet_parts where day = '2021-11-05' and hour in ('00') """ ) expected_result = """2 """ logging.info("query result:{}".format(result)) assert result == expected_result def test_orc_groupby(started_cluster): logging.info("Start testing groupby ...") node = started_cluster.instances["h0_0_0"] result = node.query( """ SELECT day, count(*) FROM default.demo_orc group by day order by day """ ) expected_result = """2021-11-01 1 2021-11-05 2 2021-11-11 1 2021-11-16 2 """ assert result == expected_result @pytest.mark.parametrize( "table,use_local_cache_for_remote_storage,enable_orc_file_minmax_index,enable_orc_stripe_minmax_index", [ pytest.param( "demo_orc_no_cache_no_index", "false", "false", "false", id="demo_orc_no_cache_no_index", ), pytest.param( "demo_orc_with_cache_no_index", "true", "false", "false", id="demo_orc_with_cache_no_index", ), pytest.param( "demo_orc_no_cache_file_index", "false", "true", "false", id="demo_orc_no_cache_file_index", ), pytest.param( "demo_orc_with_cache_file_index", "true", "true", "false", id="demo_orc_with_cache_file_index", ), pytest.param( "demo_orc_no_cache_stripe_index", "false", "true", "true", id="demo_orc_no_cache_stripe_index", ), pytest.param( "demo_orc_with_cache_stripe_index", "true", "true", "true", id="demo_orc_with_cache_stripe_index", ), ], ) def test_orc_minmax_index( started_cluster, table, use_local_cache_for_remote_storage, enable_orc_file_minmax_index, enable_orc_stripe_minmax_index, ): node = started_cluster.instances["h0_0_0"] result = node.query( """ DROP TABLE IF EXISTS default.{table}; CREATE TABLE default.{table} (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo_orc') PARTITION BY(day) SETTINGS enable_orc_file_minmax_index = {enable_orc_file_minmax_index}, enable_orc_stripe_minmax_index = {enable_orc_stripe_minmax_index}; """.format( table=table, enable_orc_file_minmax_index=enable_orc_file_minmax_index, enable_orc_stripe_minmax_index=enable_orc_stripe_minmax_index, ) ) assert result.strip() == "" for i in range(2): result = node.query( """ SELECT day, id, score FROM default.{table} where day >= '2021-11-05' and day <= '2021-11-16' and score >= 15 and score <= 30 order by day, id SETTINGS use_local_cache_for_remote_storage = {use_local_cache_for_remote_storage} """.format( table=table, use_local_cache_for_remote_storage=use_local_cache_for_remote_storage, ) ) assert ( result == """2021-11-05 abd 15 2021-11-16 aaa 22 """ ) @pytest.mark.parametrize( "table,use_local_cache_for_remote_storage,enable_parquet_rowgroup_minmax_index", [ pytest.param( "demo_parquet_no_cache_no_index", "false", "false", id="demo_parquet_no_cache_no_index", ), pytest.param( "demo_parquet_with_cache_no_index", "true", "false", id="demo_parquet_with_cache_no_index", ), pytest.param( "demo_parquet_no_cache_rowgroup_index", "false", "true", id="demo_parquet_no_cache_rowgroup_index", ), pytest.param( "demo_parquet_with_cache_rowgroup_index", "true", "true", id="demo_parquet_with_cache_rowgroup_index", ), ], ) def test_parquet_minmax_index( started_cluster, table, use_local_cache_for_remote_storage, enable_parquet_rowgroup_minmax_index, ): node = started_cluster.instances["h0_0_0"] result = node.query( """ DROP TABLE IF EXISTS default.{table}; CREATE TABLE default.{table} (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo') PARTITION BY(day) SETTINGS enable_parquet_rowgroup_minmax_index = {enable_parquet_rowgroup_minmax_index} """.format( table=table, enable_parquet_rowgroup_minmax_index=enable_parquet_rowgroup_minmax_index, ) ) assert result.strip() == "" for i in range(2): result = node.query( """ SELECT day, id, score FROM default.{table} where day >= '2021-11-05' and day <= '2021-11-16' and score >= 15 and score <= 30 order by day, id SETTINGS use_local_cache_for_remote_storage = {use_local_cache_for_remote_storage} """.format( table=table, use_local_cache_for_remote_storage=use_local_cache_for_remote_storage, ) ) assert ( result == """2021-11-05 abd 15 2021-11-16 aaa 22 """ ) def test_hive_columns_prunning(started_cluster): logging.info("Start testing groupby ...") node = started_cluster.instances["h0_0_0"] result = node.query( """ SELECT count(*) FROM default.demo_parquet_parts where day = '2021-11-05' """ ) expected_result = """4 """ logging.info("query result:{}".format(result)) assert result == expected_result def test_text_count(started_cluster): node = started_cluster.instances["h0_0_0"] result = node.query( """ SELECT day, count(*) FROM default.demo_orc group by day order by day SETTINGS format_csv_delimiter = '\x01' """ ) expected_result = """2021-11-01 1 2021-11-05 2 2021-11-11 1 2021-11-16 2 """ assert result == expected_result def test_parquet_groupby_with_cache(started_cluster): logging.info("Start testing groupby ...") node = started_cluster.instances["h0_0_0"] result = node.query( """ SELECT day, count(*) FROM default.demo_parquet group by day order by day """ ) expected_result = """2021-11-01 1 2021-11-05 2 2021-11-11 1 2021-11-16 2 """ assert result == expected_result def test_parquet_groupby_by_hive_function(started_cluster): logging.info("Start testing groupby ...") node = started_cluster.instances["h0_0_0"] result = node.query( """ SELECT day, count(*) FROM hive('thrift://hivetest:9083', 'test', 'demo', '`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)', 'day') group by day order by day """ ) expected_result = """2021-11-01 1 2021-11-05 2 2021-11-11 1 2021-11-16 2 """ assert result == expected_result def test_cache_read_bytes(started_cluster): node = started_cluster.instances["h0_0_0"] result = node.query( """ CREATE TABLE IF NOT EXISTS default.demo_parquet_1 (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo') PARTITION BY(day) """ ) test_passed = False for i in range(10): result = node.query( """ SELECT day, count(*) FROM default.demo_parquet_1 group by day order by day settings input_format_parquet_allow_missing_columns = true """ ) node.query("system flush logs") result = node.query( "select sum(ProfileEvent_ExternalDataSourceLocalCacheReadBytes) from system.metric_log where ProfileEvent_ExternalDataSourceLocalCacheReadBytes > 0" ) if result.strip() == "0": logging.info("ProfileEvent_ExternalDataSourceLocalCacheReadBytes == 0") time.sleep(10) continue test_passed = True break assert test_passed