2024-08-01 10:32:43 +00:00
|
|
|
import pytest
|
|
|
|
from helpers.cluster import ClickHouseCluster
|
|
|
|
import pyarrow.parquet as pq
|
|
|
|
import os
|
|
|
|
import time
|
|
|
|
|
|
|
|
# Cluster definition shared by every test in this module.
cluster = ClickHouseCluster(__file__)

# Directory handed to the instance as an external dir. The file-table-function
# test reads results back from `{cluster.instances_dir}{path_to_userfiles}`.
path_to_userfiles = "/var/lib/clickhouse/user_files/"

node = cluster.add_instance(
    "node",
    external_dirs=[path_to_userfiles],
)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="module")
def start_cluster():
    """Start the module-level cluster and hand it to the tests.

    Yields:
        The started ``cluster`` object. It is shut down once the fixture is
        torn down, and also if ``cluster.start()`` itself raises.
    """
    try:
        cluster.start()
        yield cluster
    finally:
        # Reached on normal teardown and on a failed startup alike.
        cluster.shutdown()
|
|
|
|
|
|
|
|
|
|
|
|
def check_page_index(file_path):
    """Tell whether the first column chunk of a Parquet file has an offset index.

    The file metadata is loaded with ``pq.read_metadata`` and only column 0 of
    row group 0 is inspected.

    Args:
        file_path: Path of the Parquet file to inspect.

    Returns:
        The ``has_offset_index`` attribute of that column chunk.
    """
    parquet_meta = pq.read_metadata(file_path)
    assert (
        parquet_meta
    ), "pyarrow.parquet library can't read parquet file written by Clickhouse"

    first_row_group = parquet_meta.row_group(0)
    first_column_chunk = first_row_group.column(0)
    return first_column_chunk.has_offset_index
|
|
|
|
|
|
|
|
|
|
|
|
def delete_if_exists(file_path):
    """Remove ``file_path`` if it is present; do nothing when it is absent.

    The removal is attempted directly instead of after an ``os.path.exists``
    check, so a file that disappears between the check and the removal can no
    longer make this helper raise.

    Args:
        file_path: Path of the file to remove.

    Raises:
        OSError: For failures other than the file being absent (for example
            when the path is a directory or is not removable).
    """
    try:
        os.remove(file_path)
    except FileNotFoundError:
        # A missing file is the expected "nothing to clean up" case.
        pass
|
|
|
|
|
|
|
|
|
2024-08-01 11:11:52 +00:00
|
|
|
@pytest.mark.parametrize(
    "query, expected_result",
    # A list (not a set) keeps the collection order deterministic between
    # runs and between pytest-xdist workers.
    [
        (
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "INTO OUTFILE '{file_name}' FORMAT Parquet "
            "SETTINGS output_format_parquet_use_custom_encoder = false, "
            "output_format_parquet_write_page_index = true;",
            True,
        ),
        (
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "INTO OUTFILE '{file_name}' FORMAT Parquet "
            "SETTINGS output_format_parquet_use_custom_encoder = false, "
            "output_format_parquet_write_page_index = false;",
            False,
        ),
        # Default settings:
        # output_format_parquet_use_custom_encoder = true
        (
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "INTO OUTFILE '{file_name}' FORMAT Parquet;",
            False,
        ),
    ],
)
def test_parquet_page_index_select_into_outfile(query, expected_result, start_cluster):
    """Check the offset index of a Parquet file produced by ``INTO OUTFILE``.

    Each case formats a uniquely named output file into the query, runs it on
    ``node`` and compares ``check_page_index`` for that file against
    ``expected_result``.

    Args:
        query: Query template containing a ``{file_name}`` placeholder.
        expected_result: Expected ``has_offset_index`` value for the file.
        start_cluster: Fixture that keeps the cluster running.
    """
    file_name = f"export{time.time()}.parquet"
    query = query.format(file_name=file_name)

    # Make sure a stale file from an earlier run cannot influence the result.
    delete_if_exists(file_name)
    try:
        assert node.query(query) == ""
        assert (
            check_page_index(file_name) == expected_result
        ), "Page offset index have wrong value"
    finally:
        # Clean up even when the query or an assertion fails.
        delete_if_exists(file_name)
|
|
|
|
|
|
|
|
|
2024-08-01 11:11:52 +00:00
|
|
|
@pytest.mark.parametrize(
    "query, expected_result",
    # A list (not a set) keeps the collection order deterministic between
    # runs and between pytest-xdist workers.
    [
        (
            "INSERT INTO TABLE FUNCTION file('{file_name}') "
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "SETTINGS output_format_parquet_use_custom_encoder=false, "
            "output_format_parquet_write_page_index=true FORMAT Parquet",
            True,
        ),
        (
            "INSERT INTO TABLE FUNCTION file('{file_name}') "
            "SELECT number, number+1 FROM system.numbers LIMIT 100 "
            "SETTINGS output_format_parquet_use_custom_encoder=false, "
            "output_format_parquet_write_page_index=false FORMAT Parquet",
            False,
        ),
        # Default settings:
        # output_format_parquet_use_custom_encoder = true
        (
            "INSERT INTO TABLE FUNCTION file('{file_name}') "
            "SELECT number, number+1 FROM system.numbers LIMIT 100 FORMAT Parquet",
            False,
        ),
    ],
)
def test_parquet_page_index_insert_into_table_function_file(
    query, expected_result, start_cluster
):
    """Check the offset index of a Parquet file written via ``file()``.

    Each case formats a uniquely named file into the query, runs it on
    ``node`` and inspects the resulting file under
    ``{cluster.instances_dir}{path_to_userfiles}``.

    Args:
        query: Query template containing a ``{file_name}`` placeholder.
        expected_result: Expected ``has_offset_index`` value for the file.
        start_cluster: Fixture that keeps the cluster running.
    """
    file_name = f"export{time.time()}.parquet"
    query = query.format(file_name=file_name)
    # path_to_userfiles is absolute, so plain concatenation is intentional
    # here (os.path.join would discard the instances_dir prefix).
    file_path = f"{cluster.instances_dir}{path_to_userfiles}{file_name}"

    # Make sure a stale file from an earlier run cannot influence the result.
    delete_if_exists(file_path)
    try:
        assert node.query(query) == ""
        assert (
            check_page_index(file_path) == expected_result
        ), "Page offset index have wrong value"
    finally:
        # Clean up even when the query or an assertion fails.
        delete_if_exists(file_path)
|