ClickHouse/tests/integration/test_parquet_page_index/test.py
2024-09-27 10:19:49 +00:00

111 lines
3.5 KiB
Python

import os
import time
import pyarrow.parquet as pq
import pytest
from helpers.cluster import ClickHouseCluster
# Cluster handle shared by every test in this module; the start_cluster
# fixture starts it and shuts it down.
cluster = ClickHouseCluster(__file__)
# Server-side user_files directory. It is handed to the instance as an
# external dir, and the INSERT test reads result files from
# cluster.instances_dir + this path.
# NOTE(review): presumably external_dirs exposes this directory on the test
# runner side under cluster.instances_dir -- confirm against helpers.cluster.
path_to_userfiles = "/var/lib/clickhouse/user_files/"
# The single ClickHouse instance that all queries in this module run against.
node = cluster.add_instance("node", external_dirs=[path_to_userfiles])
@pytest.fixture(scope="module")
def start_cluster():
    """Module-scoped fixture: start the cluster, yield it, then shut it down.

    ``cluster.start()`` sits inside the ``try`` so that ``cluster.shutdown()``
    runs both after the tests finish and when start-up itself raises.
    """
    try:
        cluster.start()
        # Tests that request this fixture run while the generator is suspended here.
        yield cluster
    finally:
        cluster.shutdown()
def check_page_index(file_path):
    """Return the ``has_offset_index`` flag of the file's first column chunk.

    The Parquet metadata is read with ``pyarrow.parquet``; only row group 0,
    column 0 is inspected.
    """
    parquet_meta = pq.read_metadata(file_path)
    assert (
        parquet_meta
    ), "pyarrow.parquet library can't read parquet file written by Clickhouse"
    first_column_chunk = parquet_meta.row_group(0).column(0)
    return first_column_chunk.has_offset_index
def delete_if_exists(file_path):
    """Remove the file at *file_path*; do nothing if it does not exist.

    The removal is attempted directly instead of checking for existence
    first, so there is no window in which the file can disappear between
    the check and the ``os.remove`` call. Any other ``OSError`` (for example
    a permission problem, or the path being a directory) still propagates.
    """
    try:
        os.remove(file_path)
    except FileNotFoundError:
        # Already absent: that is the desired end state.
        pass
@pytest.mark.parametrize(
    "query, expected_result",
    # A list, not a set: set iteration order depends on string hashing, which
    # makes the collection order differ between processes (and breaks
    # pytest-xdist, which requires every worker to collect the same order).
    [
        (
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "INTO OUTFILE '{file_name}' FORMAT Parquet "
            "SETTINGS output_format_parquet_use_custom_encoder = false, "
            "output_format_parquet_write_page_index = true;",
            True,
        ),
        (
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "INTO OUTFILE '{file_name}' FORMAT Parquet "
            "SETTINGS output_format_parquet_use_custom_encoder = false, "
            "output_format_parquet_write_page_index = false;",
            False,
        ),
        # Default settings (output_format_parquet_use_custom_encoder = true):
        # no page index is expected.
        (
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "INTO OUTFILE '{file_name}' FORMAT Parquet;",
            False,
        ),
    ],
)
def test_parquet_page_index_select_into_outfile(query, expected_result, start_cluster):
    """Check the offset-index flag of a Parquet file produced by INTO OUTFILE.

    ``query`` is a template with a ``{file_name}`` placeholder and
    ``expected_result`` is the value ``check_page_index`` should return for
    the resulting file. ``file_name`` is a bare relative name, so the file is
    read back relative to the current working directory.
    """
    # The timestamp keeps file names from colliding across parametrized cases.
    file_name = f"export{time.time()}.parquet"
    query = query.format(file_name=file_name)
    delete_if_exists(file_name)
    try:
        assert node.query(query) == ""
        assert (
            check_page_index(file_name) == expected_result
        ), "Page offset index have wrong value"
    finally:
        # Clean up even when an assertion above fails, so no files are left behind.
        delete_if_exists(file_name)
@pytest.mark.parametrize(
    "query, expected_result",
    # A list, not a set: set iteration order depends on string hashing, which
    # makes the collection order differ between processes (and breaks
    # pytest-xdist, which requires every worker to collect the same order).
    [
        (
            "INSERT INTO TABLE FUNCTION file('{file_name}') "
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "SETTINGS output_format_parquet_use_custom_encoder=false, "
            "output_format_parquet_write_page_index=true FORMAT Parquet",
            True,
        ),
        (
            "INSERT INTO TABLE FUNCTION file('{file_name}') "
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "SETTINGS output_format_parquet_use_custom_encoder=false, "
            "output_format_parquet_write_page_index=false FORMAT Parquet",
            False,
        ),
        # Default settings (output_format_parquet_use_custom_encoder = true):
        # no page index is expected.
        (
            "INSERT INTO TABLE FUNCTION file('{file_name}') "
            "SELECT number, number+1 FROM system.numbers LIMIT 100 FORMAT Parquet",
            False,
        ),
    ],
)
def test_parquet_page_index_insert_into_table_function_file(
    query, expected_result, start_cluster
):
    """Check the offset-index flag of a Parquet file written via ``file()``.

    ``query`` is a template with a ``{file_name}`` placeholder and
    ``expected_result`` is the value ``check_page_index`` should return for
    the resulting file. The file is read back from
    ``cluster.instances_dir + path_to_userfiles``.
    """
    # The timestamp keeps file names from colliding across parametrized cases.
    file_name = f"export{time.time()}.parquet"
    query = query.format(file_name=file_name)
    file_path = f"{cluster.instances_dir}{path_to_userfiles}{file_name}"
    delete_if_exists(file_path)
    try:
        assert node.query(query) == ""
        assert (
            check_page_index(file_path) == expected_result
        ), "Page offset index have wrong value"
    finally:
        # Clean up even when an assertion above fails, so no files are left behind.
        delete_if_exists(file_path)