# Mirror of https://github.com/ClickHouse/ClickHouse.git
# Synced 2024-12-11 17:02:25 +00:00, commit d4ad138a04:
# "optimization for reading hive file when all columns to read are partition keys"
# 403 lines, 12 KiB, Python
import logging
|
|
import os
|
|
|
|
import time
|
|
import pytest
|
|
from helpers.cluster import ClickHouseCluster
|
|
from helpers.test_tools import TSV
|
|
|
|
# Set the root logger to INFO and attach a stream handler to it.
logging.getLogger().setLevel(logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())

# Directory containing this test module, with symlinks resolved.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
|
|
|
|
|
@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture providing a started cluster with one Hive-enabled instance.

    Registers instance ``h0_0_0`` (with ``configs/config.xml`` and
    ``configs/hdfs-site.xml``), starts the cluster and yields it.  The
    cluster is shut down when the fixture is torn down or if start-up fails.
    """
    # Build the cluster object before entering ``try``: if construction
    # itself raised inside the ``try``, the ``finally`` clause would touch an
    # unbound ``cluster`` and mask the real error with an UnboundLocalError.
    cluster = ClickHouseCluster(__file__)
    try:
        cluster.add_instance(
            "h0_0_0",
            main_configs=["configs/config.xml"],
            extra_configs=["configs/hdfs-site.xml"],
            with_hive=True,
        )

        logging.info("Starting cluster ...")
        cluster.start()
        yield cluster
    finally:
        cluster.shutdown()
|
|
|
|
|
|
def test_create_parquet_table(started_cluster):
    """Create ``default.demo_parquet`` on top of Hive table ``test.demo``.

    The DDL is attempted up to 10 times; an attempt counts as successful when
    the query output is empty.  After every unsuccessful attempt the test
    sleeps for 60 seconds.
    """
    logging.info("Start testing creating hive table ...")
    hive_node = started_cluster.instances["h0_0_0"]
    ddl = """
    DROP TABLE IF EXISTS default.demo_parquet;
    CREATE TABLE default.demo_parquet (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo') PARTITION BY(day)
            """

    created = False
    for _attempt in range(10):
        hive_node.query("set input_format_parquet_allow_missing_columns = true")
        output = hive_node.query(ddl)
        logging.info(f"create result {output}")
        created = output.strip() == ""
        if created:
            break
        time.sleep(60)

    assert created
|
|
|
|
|
|
def test_create_parquet_table_1(started_cluster):
    """Create ``default.demo_parquet_parts`` on top of Hive table ``test.parquet_demo``.

    The table is partitioned by ``(day, hour)``.  The DDL is attempted up to
    10 times; an attempt counts as successful when the query output is empty.
    After every unsuccessful attempt the test sleeps for 60 seconds.
    """
    logging.info("Start testing creating hive table ...")
    node = started_cluster.instances["h0_0_0"]
    # Must be initialised before the loop: otherwise, when no attempt
    # succeeds, the final assert raises UnboundLocalError instead of
    # reporting a clean assertion failure.
    test_passed = False
    for _attempt in range(10):
        node.query("set input_format_parquet_allow_missing_columns = true")
        result = node.query(
            """
    DROP TABLE IF EXISTS default.demo_parquet_parts;
    CREATE TABLE default.demo_parquet_parts (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String), `hour` String) ENGINE = Hive('thrift://hivetest:9083', 'test', 'parquet_demo') PARTITION BY(day, hour);
            """
        )
        logging.info("create result {}".format(result))
        if result.strip() == "":
            test_passed = True
            break
        time.sleep(60)
    assert test_passed
|
|
|
|
|
|
def test_create_orc_table(started_cluster):
    """Create ``default.demo_orc`` on top of Hive table ``test.demo_orc``.

    The DDL is attempted up to 10 times; an attempt counts as successful when
    the query output is empty.  After every unsuccessful attempt the test
    sleeps for 60 seconds.
    """
    logging.info("Start testing creating hive table ...")
    hive_node = started_cluster.instances["h0_0_0"]
    ddl = """
    DROP TABLE IF EXISTS default.demo_orc;
    CREATE TABLE default.demo_orc (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo_orc') PARTITION BY(day)
            """

    created = False
    for _attempt in range(10):
        output = hive_node.query(ddl)
        logging.info(f"create result {output}")
        created = output.strip() == ""
        if created:
            break
        time.sleep(60)

    assert created
|
|
|
|
|
|
def test_create_text_table(started_cluster):
    """Create ``default.demo_text`` on top of Hive table ``test.demo_text``.

    The DDL is issued once and must produce empty output.
    """
    logging.info("Start testing creating hive table ...")
    hive_node = started_cluster.instances["h0_0_0"]
    ddl = """
    DROP TABLE IF EXISTS default.demo_text;
    CREATE TABLE default.demo_text (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo_text') PARTITION BY (tuple())
            """
    output = hive_node.query(ddl)
    logging.info(f"create result {output}")

    assert output.strip() == ""
|
|
|
|
|
|
def test_parquet_groupby(started_cluster):
    """Check the per-day row counts returned for ``default.demo_parquet``."""
    logging.info("Start testing groupby ...")
    node = started_cluster.instances["h0_0_0"]
    result = node.query(
        """
    SELECT day, count(*) FROM default.demo_parquet group by day order by day
            """
    )
    # Compare parsed fields instead of one whitespace-exact literal: the
    # column separator in the output (presumably a tab -- confirm) is
    # invisible inside a triple-quoted string and easily mangled by editors.
    expected_rows = [
        ["2021-11-01", "1"],
        ["2021-11-05", "2"],
        ["2021-11-11", "1"],
        ["2021-11-16", "2"],
    ]
    assert [line.split() for line in result.splitlines()] == expected_rows
|
|
|
|
|
|
def test_parquet_in_filter(started_cluster):
    """Count rows of ``default.demo_parquet_parts`` for one day with an ``IN`` filter on ``hour``.

    The query output must be exactly ``"2\\n"``.
    """
    logging.info("Start testing groupby ...")
    hive_node = started_cluster.instances["h0_0_0"]
    output = hive_node.query(
        """
    SELECT count(*) FROM default.demo_parquet_parts where day = '2021-11-05' and hour in ('00')
            """
    )
    logging.info(f"query result:{output}")
    assert output == "2\n"
|
|
|
|
|
|
def test_orc_groupby(started_cluster):
    """Check the per-day row counts returned for ``default.demo_orc``."""
    logging.info("Start testing groupby ...")
    node = started_cluster.instances["h0_0_0"]
    result = node.query(
        """
    SELECT day, count(*) FROM default.demo_orc group by day order by day
            """
    )
    # Compare parsed fields instead of one whitespace-exact literal: the
    # column separator in the output (presumably a tab -- confirm) is
    # invisible inside a triple-quoted string and easily mangled by editors.
    expected_rows = [
        ["2021-11-01", "1"],
        ["2021-11-05", "2"],
        ["2021-11-11", "1"],
        ["2021-11-16", "2"],
    ]
    assert [line.split() for line in result.splitlines()] == expected_rows
|
|
|
|
|
|
@pytest.mark.parametrize(
    "table,use_local_cache_for_remote_storage,enable_orc_file_minmax_index,enable_orc_stripe_minmax_index",
    [
        pytest.param(name, cache, file_index, stripe_index, id=name)
        for name, cache, file_index, stripe_index in (
            ("demo_orc_no_cache_no_index", "false", "false", "false"),
            ("demo_orc_with_cache_no_index", "true", "false", "false"),
            ("demo_orc_no_cache_file_index", "false", "true", "false"),
            ("demo_orc_with_cache_file_index", "true", "true", "false"),
            ("demo_orc_no_cache_stripe_index", "false", "true", "true"),
            ("demo_orc_with_cache_stripe_index", "true", "true", "true"),
        )
    ],
)
def test_orc_minmax_index(
    started_cluster,
    table,
    use_local_cache_for_remote_storage,
    enable_orc_file_minmax_index,
    enable_orc_stripe_minmax_index,
):
    """Query an ORC-backed Hive table under each cache / min-max index combination.

    Args:
        started_cluster: cluster fixture exposing instance ``h0_0_0``.
        table: name of the table created in database ``default``.
        use_local_cache_for_remote_storage: ``"true"``/``"false"``, passed as
            a query-level setting of the SELECT.
        enable_orc_file_minmax_index: ``"true"``/``"false"``, passed as a
            table-level setting of the CREATE TABLE.
        enable_orc_stripe_minmax_index: ``"true"``/``"false"``, passed as a
            table-level setting of the CREATE TABLE.
    """
    node = started_cluster.instances["h0_0_0"]
    result = node.query(
        """
    DROP TABLE IF EXISTS default.{table};
    CREATE TABLE default.{table} (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo_orc') PARTITION BY(day)
    SETTINGS enable_orc_file_minmax_index = {enable_orc_file_minmax_index}, enable_orc_stripe_minmax_index = {enable_orc_stripe_minmax_index};
            """.format(
            table=table,
            enable_orc_file_minmax_index=enable_orc_file_minmax_index,
            enable_orc_stripe_minmax_index=enable_orc_stripe_minmax_index,
        )
    )
    assert result.strip() == ""

    expected_rows = [
        ["2021-11-05", "abd", "15"],
        ["2021-11-16", "aaa", "22"],
    ]
    # The same SELECT is issued twice and checked after each run.
    for _run in range(2):
        result = node.query(
            """
    SELECT day, id, score FROM default.{table} where day >= '2021-11-05' and day <= '2021-11-16' and score >= 15 and score <= 30 order by day, id
    SETTINGS use_local_cache_for_remote_storage = {use_local_cache_for_remote_storage}
                """.format(
                table=table,
                use_local_cache_for_remote_storage=use_local_cache_for_remote_storage,
            )
        )

        # Compare parsed fields instead of one whitespace-exact literal: the
        # column separator in the output (presumably a tab -- confirm) is
        # invisible inside a triple-quoted string and easily mangled.
        assert [line.split() for line in result.splitlines()] == expected_rows
|
|
|
|
|
|
@pytest.mark.parametrize(
    "table,use_local_cache_for_remote_storage,enable_parquet_rowgroup_minmax_index",
    [
        pytest.param(name, cache, rowgroup_index, id=name)
        for name, cache, rowgroup_index in (
            ("demo_parquet_no_cache_no_index", "false", "false"),
            ("demo_parquet_with_cache_no_index", "true", "false"),
            ("demo_parquet_no_cache_rowgroup_index", "false", "true"),
            ("demo_parquet_with_cache_rowgroup_index", "true", "true"),
        )
    ],
)
def test_parquet_minmax_index(
    started_cluster,
    table,
    use_local_cache_for_remote_storage,
    enable_parquet_rowgroup_minmax_index,
):
    """Query a Parquet-backed Hive table under each cache / row-group index combination.

    Args:
        started_cluster: cluster fixture exposing instance ``h0_0_0``.
        table: name of the table created in database ``default``.
        use_local_cache_for_remote_storage: ``"true"``/``"false"``, passed as
            a query-level setting of the SELECT.
        enable_parquet_rowgroup_minmax_index: ``"true"``/``"false"``, passed
            as a table-level setting of the CREATE TABLE.
    """
    node = started_cluster.instances["h0_0_0"]
    result = node.query(
        """
    DROP TABLE IF EXISTS default.{table};
    CREATE TABLE default.{table} (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo') PARTITION BY(day)
    SETTINGS enable_parquet_rowgroup_minmax_index = {enable_parquet_rowgroup_minmax_index}
            """.format(
            table=table,
            enable_parquet_rowgroup_minmax_index=enable_parquet_rowgroup_minmax_index,
        )
    )
    assert result.strip() == ""

    expected_rows = [
        ["2021-11-05", "abd", "15"],
        ["2021-11-16", "aaa", "22"],
    ]
    # The same SELECT is issued twice and checked after each run.
    for _run in range(2):
        result = node.query(
            """
    SELECT day, id, score FROM default.{table} where day >= '2021-11-05' and day <= '2021-11-16' and score >= 15 and score <= 30 order by day, id
    SETTINGS use_local_cache_for_remote_storage = {use_local_cache_for_remote_storage}
                """.format(
                table=table,
                use_local_cache_for_remote_storage=use_local_cache_for_remote_storage,
            )
        )

        # Compare parsed fields instead of one whitespace-exact literal: the
        # column separator in the output (presumably a tab -- confirm) is
        # invisible inside a triple-quoted string and easily mangled.
        assert [line.split() for line in result.splitlines()] == expected_rows
|
|
|
|
|
|
def test_hive_columns_prunning(started_cluster):
    """Count rows of ``default.demo_parquet_parts`` filtered only on partition key ``day``.

    The query output must be exactly ``"4\\n"``.
    """
    logging.info("Start testing groupby ...")
    hive_node = started_cluster.instances["h0_0_0"]
    output = hive_node.query(
        """
    SELECT count(*) FROM default.demo_parquet_parts where day = '2021-11-05'
            """
    )
    logging.info(f"query result:{output}")
    assert output == "4\n"
|
|
|
|
|
|
def test_text_count(started_cluster):
    """Check per-day row counts with ``format_csv_delimiter`` set to ``\\x01``."""
    node = started_cluster.instances["h0_0_0"]
    # NOTE(review): despite the test name, the query reads default.demo_orc,
    # not default.demo_text -- confirm whether that is intended.
    result = node.query(
        """
    SELECT day, count(*) FROM default.demo_orc group by day order by day SETTINGS format_csv_delimiter = '\x01'
            """
    )
    # Compare parsed fields instead of one whitespace-exact literal: the
    # column separator in the output (presumably a tab -- confirm) is
    # invisible inside a triple-quoted string and easily mangled by editors.
    expected_rows = [
        ["2021-11-01", "1"],
        ["2021-11-05", "2"],
        ["2021-11-11", "1"],
        ["2021-11-16", "2"],
    ]
    assert [line.split() for line in result.splitlines()] == expected_rows
|
|
|
|
|
|
def test_parquet_groupby_with_cache(started_cluster):
    """Re-run the per-day count on ``default.demo_parquet`` and check the result.

    NOTE(review): the name suggests this run is expected to be served from
    the local cache; nothing in this function enables or verifies that.
    """
    logging.info("Start testing groupby ...")
    node = started_cluster.instances["h0_0_0"]
    result = node.query(
        """
    SELECT day, count(*) FROM default.demo_parquet group by day order by day
            """
    )
    # Compare parsed fields instead of one whitespace-exact literal: the
    # column separator in the output (presumably a tab -- confirm) is
    # invisible inside a triple-quoted string and easily mangled by editors.
    expected_rows = [
        ["2021-11-01", "1"],
        ["2021-11-05", "2"],
        ["2021-11-11", "1"],
        ["2021-11-16", "2"],
    ]
    assert [line.split() for line in result.splitlines()] == expected_rows
|
|
|
|
|
|
def test_parquet_groupby_by_hive_function(started_cluster):
    """Check per-day row counts when reading through the ``hive(...)`` table function."""
    logging.info("Start testing groupby ...")
    node = started_cluster.instances["h0_0_0"]
    result = node.query(
        """
    SELECT day, count(*) FROM hive('thrift://hivetest:9083', 'test', 'demo', '`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)', 'day') group by day order by day
            """
    )
    # Compare parsed fields instead of one whitespace-exact literal: the
    # column separator in the output (presumably a tab -- confirm) is
    # invisible inside a triple-quoted string and easily mangled by editors.
    expected_rows = [
        ["2021-11-01", "1"],
        ["2021-11-05", "2"],
        ["2021-11-11", "1"],
        ["2021-11-16", "2"],
    ]
    assert [line.split() for line in result.splitlines()] == expected_rows
|
|
|
|
|
|
def test_cache_read_bytes(started_cluster):
    """Check that ``ExternalDataSourceLocalCacheReadBytes`` becomes non-zero in ``system.metric_log``.

    Creates ``default.demo_parquet_1`` if needed, then up to 10 times: reads
    the whole table, flushes logs and sums the profile event.  A sum of
    ``"0"`` triggers a 10 second sleep and another attempt; any other value
    passes the test.
    """
    hive_node = started_cluster.instances["h0_0_0"]
    hive_node.query(
        """
    CREATE TABLE IF NOT EXISTS default.demo_parquet_1 (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo') PARTITION BY(day)
            """
    )
    read_table_sql = """
    SELECT * FROM default.demo_parquet_1 settings input_format_parquet_allow_missing_columns = true
            """
    cache_bytes_sql = "select sum(ProfileEvent_ExternalDataSourceLocalCacheReadBytes) from system.metric_log where ProfileEvent_ExternalDataSourceLocalCacheReadBytes > 0"

    saw_cache_reads = False
    for _attempt in range(10):
        hive_node.query(read_table_sql)
        hive_node.query("system flush logs")
        total = hive_node.query(cache_bytes_sql)
        if total.strip() != "0":
            saw_cache_reads = True
            break
        logging.info("ProfileEvent_ExternalDataSourceLocalCacheReadBytes == 0")
        time.sleep(10)

    assert saw_cache_reads
|
|
|
|
|
|
def test_cache_dir_use(started_cluster):
    """Check that both local cache directories in the container are non-empty.

    Counts the entries of ``/tmp/clickhouse_local_cache`` and
    ``/tmp/clickhouse_local_cache1`` with ``ls | wc -l`` and requires both
    counts to differ from zero.
    """
    node = started_cluster.instances["h0_0_0"]
    result0 = node.exec_in_container(
        ["bash", "-c", "ls /tmp/clickhouse_local_cache | wc -l"]
    )
    result1 = node.exec_in_container(
        ["bash", "-c", "ls /tmp/clickhouse_local_cache1 | wc -l"]
    )
    # ``wc -l`` terminates its output with a newline; without stripping, the
    # raw output can never equal "0" and the assertion could not fail.
    assert result0.strip() != "0" and result1.strip() != "0"
|