ClickHouse/tests/integration/test_parquet_page_index/test.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

111 lines
3.5 KiB
Python
Raw Normal View History

import os
import time
2024-09-27 10:19:39 +00:00
import pyarrow.parquet as pq
import pytest
from helpers.cluster import ClickHouseCluster
# Module-level cluster object shared by the fixture and tests in this file.
cluster = ClickHouseCluster(__file__)
# Directory path that the tests append to cluster.instances_dir to locate
# files written by the server.
path_to_userfiles = "/var/lib/clickhouse/user_files/"
# NOTE(review): presumably external_dirs makes this directory reachable from
# the test host - confirm against helpers.cluster.
node = cluster.add_instance("node", external_dirs=[path_to_userfiles])
@pytest.fixture(scope="module")
def start_cluster():
    """Start the module-level cluster once per test module and yield it.

    The cluster is shut down when the module's tests are finished, and also
    when start-up itself raises, because the shutdown lives in ``finally``.
    """
    try:
        cluster.start()
        yield cluster
    finally:
        # Runs on normal teardown as well as on a failed start.
        cluster.shutdown()
def check_page_index(file_path):
    """Report whether the Parquet file at *file_path* carries an offset index.

    Only the first column chunk of the first row group is inspected.

    Raises:
        AssertionError: if the metadata object read by pyarrow is falsy.
    """
    parquet_meta = pq.read_metadata(file_path)
    assert (
        parquet_meta
    ), "pyarrow.parquet library can't read parquet file written by Clickhouse"
    first_column_chunk = parquet_meta.row_group(0).column(0)
    return first_column_chunk.has_offset_index
def delete_if_exists(file_path):
    """Remove *file_path*, doing nothing when it is already absent.

    The removal is attempted directly instead of checking for existence
    first. That avoids a race between the check and the removal, and it
    also removes dangling symlinks, for which ``os.path.exists`` reports
    ``False``.

    Raises:
        OSError: for any failure other than the path not existing
            (for example when *file_path* is a directory).
    """
    try:
        os.remove(file_path)
    except FileNotFoundError:
        # Already gone: that is the state the caller asked for.
        pass
2024-08-01 11:11:52 +00:00
@pytest.mark.parametrize(
    "query, expected_result",
    # A list rather than a set: set iteration order of these string tuples
    # varies with hash randomization, which makes the collection order
    # unstable between runs and between pytest-xdist workers.
    [
        (
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "INTO OUTFILE '{file_name}' FORMAT Parquet "
            "SETTINGS output_format_parquet_use_custom_encoder = false, "
            "output_format_parquet_write_page_index = true;",
            True,
        ),
        (
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "INTO OUTFILE '{file_name}' FORMAT Parquet "
            "SETTINGS output_format_parquet_use_custom_encoder = false, "
            "output_format_parquet_write_page_index = false;",
            False,
        ),
        # default settings:
        # output_format_parquet_use_custom_encoder = true
        (
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "INTO OUTFILE '{file_name}' FORMAT Parquet;",
            False,
        ),
    ],
)
def test_parquet_page_index_select_into_outfile(query, expected_result, start_cluster):
    """Check the page offset index of a file produced by ``INTO OUTFILE``.

    Each case formats a uniquely named output file into the query, runs it
    on ``node``, and compares what ``check_page_index`` reports for that file
    with ``expected_result``. The file is removed afterwards even when an
    assertion fails, so a failing case leaves nothing behind.
    """
    file_name = f"export{time.time()}.parquet"
    query = query.format(file_name=file_name)
    # Start from a clean slate in case a file with this name is present.
    delete_if_exists(file_name)
    try:
        assert node.query(query) == ""
        assert (
            check_page_index(file_name) == expected_result
        ), "Page offset index have wrong value"
    finally:
        delete_if_exists(file_name)
2024-08-01 11:11:52 +00:00
@pytest.mark.parametrize(
    "query, expected_result",
    # A list rather than a set: set iteration order of these string tuples
    # varies with hash randomization, which makes the collection order
    # unstable between runs and between pytest-xdist workers.
    [
        (
            "INSERT INTO TABLE FUNCTION file('{file_name}') "
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "SETTINGS output_format_parquet_use_custom_encoder=false, "
            "output_format_parquet_write_page_index=true FORMAT Parquet",
            True,
        ),
        (
            "INSERT INTO TABLE FUNCTION file('{file_name}') "
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "SETTINGS output_format_parquet_use_custom_encoder=false, "
            "output_format_parquet_write_page_index=false FORMAT Parquet",
            False,
        ),
        # default settings:
        # output_format_parquet_use_custom_encoder = true
        (
            "INSERT INTO TABLE FUNCTION file('{file_name}') "
            "SELECT number, number+1 FROM system.numbers LIMIT 100 FORMAT Parquet",
            False,
        ),
    ],
)
def test_parquet_page_index_insert_into_table_function_file(
    query, expected_result, start_cluster
):
    """Check the page offset index of a file written through ``file()``.

    Each case formats a uniquely named file into the INSERT query, runs it
    on ``node``, and compares what ``check_page_index`` reports with
    ``expected_result``. The file is looked up under ``cluster.instances_dir``
    followed by ``path_to_userfiles``. It is removed afterwards even when an
    assertion fails, so a failing case leaves nothing behind.
    """
    file_name = f"export{time.time()}.parquet"
    query = query.format(file_name=file_name)
    file_path = f"{cluster.instances_dir}{path_to_userfiles}{file_name}"
    # Start from a clean slate in case a file with this name is present.
    delete_if_exists(file_path)
    try:
        assert node.query(query) == ""
        assert (
            check_page_index(file_path) == expected_result
        ), "Page offset index have wrong value"
    finally:
        delete_if_exists(file_path)