ClickHouse/tests/integration/test_parquet_page_index/test.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

109 lines
3.5 KiB
Python
Raw Normal View History

import pytest
from helpers.cluster import ClickHouseCluster
import pyarrow.parquet as pq
import os
import time
# Single-node cluster; the node's user_files directory is exported via
# external_dirs so Parquet files written by the server can be read back
# with pyarrow on the host (see the file() table-function test below).
cluster = ClickHouseCluster(__file__)
path_to_userfiles = "/var/lib/clickhouse/user_files/"
node = cluster.add_instance("node", external_dirs=[path_to_userfiles])
@pytest.fixture(scope="module")
def start_cluster():
    """Module-scoped fixture: boot the cluster once, always shut it down.

    start() stays inside the try-block so shutdown() runs even when
    startup itself fails partway through.
    """
    try:
        cluster.start()
        yield cluster
    finally:
        cluster.shutdown()
def check_page_index(file_path):
    """Return True if the first column chunk of the first row group in the
    Parquet file at *file_path* carries a page offset index."""
    parquet_meta = pq.read_metadata(file_path)
    assert (
        parquet_meta
    ), "pyarrow.parquet library can't read parquet file written by Clickhouse"
    first_column_chunk = parquet_meta.row_group(0).column(0)
    return first_column_chunk.has_offset_index
def delete_if_exists(file_path):
    """Remove *file_path* if it exists; a missing file is not an error."""
    if not os.path.exists(file_path):
        return
    os.remove(file_path)
@pytest.mark.parametrize(
    "query, expected_result",
    # A list (not a set) so parameter order is deterministic and no case
    # can be silently deduplicated.
    [
        (
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "INTO OUTFILE '{file_name}' FORMAT Parquet "
            "SETTINGS output_format_parquet_use_custom_encoder = false, "
            "output_format_parquet_write_page_index = true;",
            True,
        ),
        (
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "INTO OUTFILE '{file_name}' FORMAT Parquet "
            "SETTINGS output_format_parquet_use_custom_encoder = false, "
            "output_format_parquet_write_page_index = false;",
            False,
        ),
        # default settings: output_format_parquet_use_custom_encoder = true;
        # no page index is expected in that case.
        (
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "INTO OUTFILE '{file_name}' FORMAT Parquet;",
            False,
        ),
    ],
)
def test_parquet_page_index_select_into_outfile(query, expected_result, start_cluster):
    """SELECT ... INTO OUTFILE writes a Parquet page (offset) index exactly
    when output_format_parquet_write_page_index is enabled."""
    # Timestamped name keeps parametrized runs from clobbering each other.
    file_name = f"export{time.time()}.parquet"
    query = query.format(file_name=file_name)
    delete_if_exists(file_name)
    # An empty response means the query succeeded.
    assert node.query(query) == ""
    assert (
        check_page_index(file_name) == expected_result
    ), "Page offset index have wrong value"
    delete_if_exists(file_name)
2024-08-01 11:11:52 +00:00
@pytest.mark.parametrize(
    "query, expected_result",
    # A list (not a set) so parameter order is deterministic and no case
    # can be silently deduplicated.
    [
        (
            "INSERT INTO TABLE FUNCTION file('{file_name}') "
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "SETTINGS output_format_parquet_use_custom_encoder=false, "
            "output_format_parquet_write_page_index=true FORMAT Parquet",
            True,
        ),
        (
            "INSERT INTO TABLE FUNCTION file('{file_name}') "
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "SETTINGS output_format_parquet_use_custom_encoder=false, "
            "output_format_parquet_write_page_index=false FORMAT Parquet",
            False,
        ),
        # default settings: output_format_parquet_use_custom_encoder = true;
        # no page index is expected in that case.
        (
            "INSERT INTO TABLE FUNCTION file('{file_name}') "
            "SELECT number, number+1 FROM system.numbers LIMIT 100 FORMAT Parquet",
            False,
        ),
    ],
)
def test_parquet_page_index_insert_into_table_function_file(
    query, expected_result, start_cluster
):
    """INSERT INTO the file() table function writes a Parquet page (offset)
    index exactly when output_format_parquet_write_page_index is enabled."""
    # Timestamped name keeps parametrized runs from clobbering each other.
    file_name = f"export{time.time()}.parquet"
    query = query.format(file_name=file_name)
    # file() writes into the node's user_files dir, which external_dirs
    # maps onto the host under cluster.instances_dir.
    file_path = f"{cluster.instances_dir}{path_to_userfiles}{file_name}"
    delete_if_exists(file_path)
    # An empty response means the query succeeded.
    assert node.query(query) == ""
    assert (
        check_page_index(file_path) == expected_result
    ), "Page offset index have wrong value"
    delete_if_exists(file_path)