ClickHouse/tests/integration/test_storage_iceberg/test.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

1074 lines
36 KiB
Python
Raw Normal View History

2024-09-27 10:19:39 +00:00
import glob
import json
import logging
import os
import time
import uuid
2024-09-27 10:19:39 +00:00
from datetime import datetime
2023-01-18 08:33:55 +00:00
2024-09-27 10:19:39 +00:00
import pyspark
import pytest
from azure.storage.blob import BlobServiceClient
from minio.deleteobjects import DeleteObject
from pyspark.sql.functions import (
current_timestamp,
monotonically_increasing_id,
row_number,
)
from pyspark.sql.readwriter import DataFrameWriter, DataFrameWriterV2
2023-04-03 14:57:49 +00:00
from pyspark.sql.types import (
2024-09-27 10:19:39 +00:00
ArrayType,
BooleanType,
2023-04-03 14:57:49 +00:00
DateType,
2024-09-27 10:19:39 +00:00
IntegerType,
StringType,
StructField,
StructType,
2023-04-03 14:57:49 +00:00
TimestampType,
)
from pyspark.sql.window import Window
2024-09-27 10:19:39 +00:00
import helpers.client
2024-10-10 13:37:17 +00:00
from helpers.cluster import ClickHouseCluster, ClickHouseInstance, is_arm
2024-08-12 13:45:00 +00:00
from helpers.s3_tools import (
2024-07-31 10:29:12 +00:00
AzureUploader,
2024-10-02 11:37:02 +00:00
HDFSUploader,
2024-07-31 10:29:12 +00:00
LocalUploader,
2024-09-27 10:19:39 +00:00
S3Uploader,
get_file_contents,
list_s3_objects,
prepare_s3_bucket,
2024-01-22 17:24:48 +00:00
)
2024-09-27 10:19:39 +00:00
from helpers.test_tools import TSV
2023-01-18 08:33:55 +00:00
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
2023-01-18 08:33:55 +00:00
2023-04-11 15:23:05 +00:00
def get_spark():
    """Build (or reuse) the local Spark session used to author Iceberg tables.

    The session catalog is replaced with Iceberg's ``SparkSessionCatalog``
    (type ``hadoop``, warehouse ``/iceberg_data``), a ``local`` catalog is
    registered as an Iceberg ``SparkCatalog``, and the Iceberg Spark session
    extensions are enabled.

    Returns:
        The ``SparkSession`` returned by ``getOrCreate()`` on a ``local`` master.
    """
    # Insertion order is preserved, so options are applied in the listed order.
    iceberg_options = {
        "spark.sql.catalog.spark_catalog": "org.apache.iceberg.spark.SparkSessionCatalog",
        "spark.sql.catalog.local": "org.apache.iceberg.spark.SparkCatalog",
        "spark.sql.catalog.spark_catalog.type": "hadoop",
        "spark.sql.catalog.spark_catalog.warehouse": "/iceberg_data",
        "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    }

    builder = pyspark.sql.SparkSession.builder.appName("spark_test")
    for option, value in iceberg_options.items():
        builder = builder.config(option, value)

    # The master only needs to be set once (it used to be set twice).
    return builder.master("local").getOrCreate()
2023-01-18 08:33:55 +00:00
@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture: start a three-node cluster and attach test helpers.

    ``node1`` is started with MinIO, Azurite and (except on ARM) HDFS;
    ``node2`` and ``node3`` share the same server/user configs. After start-up
    the fixture attaches to the cluster object: ``spark_session``, the
    ``default_{s3,azure,hdfs,local}_uploader`` helpers, ``azure_container_name``
    and ``container_client``.

    Yields:
        The started ``ClickHouseCluster``. It is shut down when the module's
        tests finish or when set-up fails.
    """
    # Constructed outside the ``try`` so that a constructor failure surfaces
    # as-is instead of being masked by a NameError from ``cluster.shutdown()``.
    cluster = ClickHouseCluster(__file__, with_spark=True)
    try:
        # HDFS is not brought up on ARM; the HDFS test variants skip themselves.
        with_hdfs = not is_arm()

        main_configs = [
            "configs/config.d/query_log.xml",
            "configs/config.d/cluster.xml",
            "configs/config.d/named_collections.xml",
            "configs/config.d/filesystem_caches.xml",
        ]
        user_configs = ["configs/users.d/users.xml"]

        # Each instance receives its own copy of the config lists.
        cluster.add_instance(
            "node1",
            main_configs=list(main_configs),
            user_configs=list(user_configs),
            with_minio=True,
            with_azurite=True,
            with_hdfs=with_hdfs,
            stay_alive=True,
        )
        for node_name in ("node2", "node3"):
            cluster.add_instance(
                node_name,
                main_configs=list(main_configs),
                user_configs=list(user_configs),
                stay_alive=True,
            )

        logging.info("Starting cluster...")
        cluster.start()

        prepare_s3_bucket(cluster)
        logging.info("S3 bucket created")

        cluster.spark_session = get_spark()
        cluster.default_s3_uploader = S3Uploader(
            cluster.minio_client, cluster.minio_bucket
        )

        cluster.azure_container_name = "mycontainer"
        cluster.container_client = cluster.blob_service_client.create_container(
            cluster.azure_container_name
        )
        cluster.default_azure_uploader = AzureUploader(
            cluster.blob_service_client, cluster.azure_container_name
        )

        cluster.default_hdfs_uploader = HDFSUploader(cluster)

        cluster.default_local_uploader = LocalUploader(cluster.instances["node1"])

        yield cluster
    finally:
        cluster.shutdown()
def run_query(instance, query, stdin=None, settings=None):
    # type: (ClickHouseInstance, str, object, dict) -> str
    """Run *query* on *instance*, logging before and after, and return the result.

    ``stdin`` and ``settings`` are forwarded unchanged to ``instance.query``.
    """
    logging.info(f"Running query '{query}'...")
    output = instance.query(query, stdin=stdin, settings=settings)
    logging.info("Query finished")
    return output
2023-04-03 14:57:49 +00:00
def write_iceberg_from_file(
    spark, path, table_name, mode="overwrite", format_version="1", partition_by=None
):
    """Load the local file at *path* with Spark and write it to *table_name*.

    With ``mode == "overwrite"`` the table is created via the ``iceberg``
    provider with the ``format-version`` table property set to
    *format_version*, partitioned by *partition_by* when one is given.
    Any other *mode* appends to the existing table; *format_version* and
    *partition_by* are ignored in that case.
    """
    writer = spark.read.load(f"file://{path}").writeTo(table_name)

    if mode != "overwrite":
        writer.append()
        return

    if partition_by is not None:
        writer = writer.partitionedBy(partition_by)
    writer.tableProperty("format-version", format_version).using("iceberg").create()
2023-04-03 18:56:10 +00:00
def write_iceberg_from_df(
    spark, df, table_name, mode="overwrite", format_version="1", partition_by=None
):
    """Write the Spark DataFrame *df* to the Iceberg table *table_name*.

    With ``mode == "overwrite"`` the table is created via the ``iceberg``
    provider with the ``format-version`` table property set to
    *format_version*, partitioned by *partition_by* when one is given.
    Any other *mode* value appends to the existing table; *format_version*
    and *partition_by* are ignored in that case.

    *spark* is accepted for signature symmetry with
    ``write_iceberg_from_file`` but is not used here.
    """
    writer = df.writeTo(table_name)

    if mode != "overwrite":
        writer.append()
        return

    writer = writer.tableProperty("format-version", format_version)
    if partition_by is not None:
        writer = writer.partitionedBy(partition_by)
    writer.using("iceberg").create()
def generate_data(spark, start, end):
    """Build a two-column DataFrame ``(a, b)`` for the range ``[start, end)``.

    Column ``a`` comes from ``spark.range(start, end)``; column ``b`` comes
    from ``spark.range(start + 1, end + 1)`` cast to string. The two columns
    are paired up by a generated row number, which is dropped from the result.
    """

    def with_row_index(frame):
        # Number the rows so the two single-column frames can be joined.
        return frame.withColumn(
            "row_index",
            row_number().over(Window.orderBy(monotonically_increasing_id())),
        )

    numbers = spark.range(start, end, 1).toDF("a")
    labels = spark.range(start + 1, end + 1, 1).toDF("b")
    labels = labels.withColumn("b", labels["b"].cast(StringType()))

    joined = with_row_index(numbers).join(with_row_index(labels), on=["row_index"])
    return joined.drop("row_index")
2023-01-18 08:33:55 +00:00
2024-08-14 11:35:37 +00:00
def get_creation_expression(
    storage_type,
    table_name,
    cluster,
    format="Parquet",
    table_function=False,
    run_on_cluster=False,
    **kwargs,
):
    """Build the SQL used to access the Iceberg table *table_name*.

    Args:
        storage_type: One of ``"s3"``, ``"azure"``, ``"hdfs"``, ``"local"``.
        table_name: Table name; also the directory under ``iceberg_data/default/``.
        cluster: Cluster object providing ``minio_bucket``,
            ``azure_container_name`` and ``env_variables``.
        format: Data file format passed as ``format=...``.
        table_function: If true, return a table-function expression
            (``icebergS3(...)`` etc.) to be embedded in a query; otherwise
            return ``DROP TABLE IF EXISTS`` + ``CREATE TABLE ... ENGINE=...``.
        run_on_cluster: If true, return the ``*Cluster`` table function over
            ``cluster_simple``. Requires ``table_function`` and is not
            available for ``"local"``.
        **kwargs: Only ``bucket`` is read (S3 only); it overrides
            ``cluster.minio_bucket``.

    Returns:
        The SQL text as a string.

    Raises:
        AssertionError: ``run_on_cluster`` without ``table_function``, or
            ``run_on_cluster`` with ``storage_type == "local"``.
        Exception: Unknown *storage_type*.
    """

    def create_table(engine_expr, terminator=""):
        # DROP first so the statement can be replayed for an existing name.
        return (
            f"\nDROP TABLE IF EXISTS {table_name};\n"
            f"CREATE TABLE {table_name}\n"
            f"ENGINE={engine_expr}{terminator}"
        )

    if storage_type == "s3":
        bucket = kwargs["bucket"] if "bucket" in kwargs else cluster.minio_bucket
        args = (
            f"s3, filename = 'iceberg_data/default/{table_name}/', "
            f"format={format}, url = 'http://minio1:9001/{bucket}/'"
        )
        if run_on_cluster:
            assert table_function
            return f"icebergS3Cluster('cluster_simple', {args})"
        if table_function:
            return f"icebergS3({args})"
        return create_table(f"IcebergS3({args})")

    if storage_type == "azure":
        if run_on_cluster:
            assert table_function
        account_url = cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]
        # The container name is quoted in every form; the engine form used to
        # pass it unquoted, unlike the two table-function forms.
        args = (
            f"azure, container = '{cluster.azure_container_name}', "
            f"storage_account_url = '{account_url}', "
            f"blob_path = '/iceberg_data/default/{table_name}/', format={format}"
        )
        if run_on_cluster:
            return f"\nicebergAzureCluster('cluster_simple', {args})\n"
        if table_function:
            return f"\nicebergAzure({args})\n"
        return create_table(f"IcebergAzure({args})")

    if storage_type == "hdfs":
        function_args = (
            f"hdfs, filename= 'iceberg_data/default/{table_name}/', "
            f"format={format}, url = 'hdfs://hdfs1:9000/'"
        )
        if run_on_cluster:
            assert table_function
            return f"\nicebergHDFSCluster('cluster_simple', {function_args})\n"
        if table_function:
            return f"\nicebergHDFS({function_args})\n"
        engine_args = (
            f"hdfs, filename = 'iceberg_data/default/{table_name}/', "
            f"format={format}, url = 'hdfs://hdfs1:9000/'"
        )
        return create_table(f"IcebergHDFS({engine_args})", terminator=";")

    if storage_type == "local":
        # There is no cluster variant for local storage.
        assert not run_on_cluster
        args = f"local, path = '/iceberg_data/default/{table_name}/', format={format}"
        if table_function:
            return f"\nicebergLocal({args})\n"
        return create_table(f"IcebergLocal({args})", terminator=";")

    raise Exception(f"Unknown iceberg storage type: {storage_type}")
2023-01-18 08:33:55 +00:00
2024-08-14 11:35:37 +00:00
def get_uuid_str():
    """Return a random UUID4 as text with hyphens replaced by underscores."""
    return "_".join(str(uuid.uuid4()).split("-"))
def create_iceberg_table(
    storage_type,
    node,
    table_name,
    cluster,
    format="Parquet",
    **kwargs,
):
    """Create the Iceberg table *table_name* on *node*.

    The statement is produced by ``get_creation_expression`` for the given
    *storage_type*, *cluster* and *format*; extra keyword arguments are
    forwarded to it unchanged.
    """
    creation_sql = get_creation_expression(
        storage_type, table_name, cluster, format, **kwargs
    )
    node.query(creation_sql)
2023-04-05 18:32:37 +00:00
def create_initial_data_file(
    cluster, node, query, table_name, compression_method="none"
):
    """Write the result of *query* to ``<table_name>.parquet`` via *node*.

    The data is inserted through the ``file()`` table function using the given
    Parquet *compression_method*.

    Returns:
        The path of the written file under the ``node1`` instance's
        ``database/user_files`` directory, resolved relative to this script.
        NOTE(review): the path is always built for ``node1`` regardless of
        *node* - confirm callers only pass ``node1``.
    """
    parquet_name = f"{table_name}.parquet"

    insert_sql = (
        "\n"
        "INSERT INTO TABLE FUNCTION\n"
        f"file('{parquet_name}')\n"
        "SETTINGS\n"
        f"output_format_parquet_compression_method='{compression_method}',\n"
        f"s3_truncate_on_insert=1 {query}\n"
        "FORMAT Parquet"
    )
    node.query(insert_sql)

    user_files_dir = os.path.join(
        SCRIPT_DIR, f"{cluster.instances_dir_name}/node1/database/user_files"
    )
    return f"{user_files_dir}/{parquet_name}"
2024-08-12 12:15:54 +00:00
def default_upload_directory(
    started_cluster, storage_type, local_path, remote_path, **kwargs
):
    """Upload *local_path* to *remote_path* with the uploader for *storage_type*.

    Args:
        started_cluster: Cluster object carrying the ``default_*_uploader``
            helpers.
        storage_type: One of ``"local"``, ``"hdfs"``, ``"s3"``, ``"azure"``.
        local_path: Source directory.
        remote_path: Destination passed to the uploader.
        **kwargs: Forwarded unchanged to the uploader's ``upload_directory``.

    Returns:
        Whatever the selected uploader's ``upload_directory`` returns.

    Raises:
        Exception: Unknown *storage_type*.
    """
    uploader_attributes = {
        "local": "default_local_uploader",
        "hdfs": "default_hdfs_uploader",
        "s3": "default_s3_uploader",
        "azure": "default_azure_uploader",
    }

    attribute = uploader_attributes.get(storage_type)
    if attribute is None:
        raise Exception(f"Unknown iceberg storage type: {storage_type}")

    # Resolved lazily so only the selected uploader has to exist on the cluster.
    uploader = getattr(started_cluster, attribute)
    return uploader.upload_directory(local_path, remote_path, **kwargs)
2024-07-31 10:29:12 +00:00
2023-04-03 14:57:49 +00:00
@pytest.mark.parametrize("format_version", ["1", "2"])
@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs", "local"])
def test_single_iceberg_file(started_cluster, format_version, storage_type):
    """Write one 100-row Iceberg table and check ClickHouse reads it back intact."""
    if is_arm() and storage_type == "hdfs":
        pytest.skip("Disabled test IcebergHDFS for aarch64")

    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    TABLE_NAME = (
        f"test_single_iceberg_file_{format_version}_{storage_type}_{get_uuid_str()}"
    )

    # Pass the parametrized format version through; it was previously dropped,
    # so both parametrizations wrote a format-version 1 table.
    write_iceberg_from_df(
        spark,
        generate_data(spark, 0, 100),
        TABLE_NAME,
        mode="overwrite",
        format_version=format_version,
    )

    default_upload_directory(
        started_cluster,
        storage_type,
        f"/iceberg_data/default/{TABLE_NAME}/",
        f"/iceberg_data/default/{TABLE_NAME}/",
    )

    create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)

    assert instance.query(f"SELECT * FROM {TABLE_NAME}") == instance.query(
        "SELECT number, toString(number + 1) FROM numbers(100)"
    )
2023-03-30 21:09:12 +00:00
2023-04-03 18:56:10 +00:00
@pytest.mark.parametrize("format_version", ["1", "2"])
@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs", "local"])
def test_partition_by(started_cluster, format_version, storage_type):
    """Write a 10-row table partitioned by column ``a`` and count it from ClickHouse."""
    if is_arm() and storage_type == "hdfs":
        pytest.skip("Disabled test IcebergHDFS for aarch64")

    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    TABLE_NAME = f"test_partition_by_{format_version}_{storage_type}_{get_uuid_str()}"
    table_dir = f"/iceberg_data/default/{TABLE_NAME}/"

    source_df = generate_data(spark, 0, 10)
    write_iceberg_from_df(
        spark,
        source_df,
        TABLE_NAME,
        mode="overwrite",
        format_version=format_version,
        partition_by="a",
    )

    files = default_upload_directory(
        started_cluster, storage_type, table_dir, table_dir
    )
    assert len(files) == 14  # 10 partitions + 4 metadata files

    create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)

    row_count = int(instance.query(f"SELECT count() FROM {TABLE_NAME}"))
    assert row_count == 10
2023-04-03 14:57:49 +00:00
@pytest.mark.parametrize("format_version", ["1", "2"])
@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs", "local"])
def test_multiple_iceberg_files(started_cluster, format_version, storage_type):
    """Create a table, append a second batch, and check both are visible.

    After the first write 5 files are uploaded and 100 rows are readable;
    after the append 9 files are uploaded and rows 0..199 are readable.
    """
    if is_arm() and storage_type == "hdfs":
        pytest.skip("Disabled test IcebergHDFS for aarch64")

    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    TABLE_NAME = (
        f"test_multiple_iceberg_files_{format_version}_{storage_type}_{get_uuid_str()}"
    )

    write_iceberg_from_df(
        spark,
        generate_data(spark, 0, 100),
        TABLE_NAME,
        mode="overwrite",
        format_version=format_version,
    )

    files = default_upload_directory(
        started_cluster,
        storage_type,
        f"/iceberg_data/default/{TABLE_NAME}/",
        f"/iceberg_data/default/{TABLE_NAME}/",
    )

    # ['/iceberg_data/default/test_multiple_iceberg_files/data/00000-1-35302d56-f1ed-494e-a85b-fbf85c05ab39-00001.parquet',
    # '/iceberg_data/default/test_multiple_iceberg_files/metadata/version-hint.text',
    # '/iceberg_data/default/test_multiple_iceberg_files/metadata/3127466b-299d-48ca-a367-6b9b1df1e78c-m0.avro',
    # '/iceberg_data/default/test_multiple_iceberg_files/metadata/snap-5220855582621066285-1-3127466b-299d-48ca-a367-6b9b1df1e78c.avro',
    # '/iceberg_data/default/test_multiple_iceberg_files/metadata/v1.metadata.json']
    assert len(files) == 5

    create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)
    assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100

    write_iceberg_from_df(
        spark,
        generate_data(spark, 100, 200),
        TABLE_NAME,
        mode="append",
        format_version=format_version,
    )

    files = default_upload_directory(
        started_cluster,
        storage_type,
        f"/iceberg_data/default/{TABLE_NAME}/",
        "",
    )
    assert len(files) == 9

    assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 200
    assert instance.query(f"SELECT * FROM {TABLE_NAME} ORDER BY 1") == instance.query(
        "SELECT number, toString(number + 1) FROM numbers(200)"
    )
@pytest.mark.parametrize("format_version", ["1", "2"])
@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs", "local"])
def test_types(started_cluster, format_version, storage_type):
    """Round-trip one row of int/string/date/array/bool values.

    The row is read both through the table engine and through the table
    function, and the table function's inferred schema is checked.
    """
    if is_arm() and storage_type == "hdfs":
        pytest.skip("Disabled test IcebergHDFS for aarch64")

    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    TABLE_NAME = f"test_types_{format_version}_{storage_type}_{get_uuid_str()}"
    table_dir = f"/iceberg_data/default/{TABLE_NAME}/"

    schema = StructType(
        [
            StructField("a", IntegerType()),
            StructField("b", StringType()),
            StructField("c", DateType()),
            StructField("d", ArrayType(StringType())),
            StructField("e", BooleanType()),
        ]
    )
    row = (
        123,
        "string",
        datetime.strptime("2000-01-01", "%Y-%m-%d"),
        ["str1", "str2"],
        True,
    )
    df = spark.createDataFrame(data=[row], schema=schema)
    df.printSchema()

    write_iceberg_from_df(
        spark, df, TABLE_NAME, mode="overwrite", format_version=format_version
    )
    default_upload_directory(started_cluster, storage_type, table_dir, table_dir)
    create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)

    expected_row = "123\tstring\t2000-01-01\t['str1','str2']\ttrue"

    # Through the table engine.
    assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 1
    engine_row = instance.query(f"SELECT a, b, c, d, e FROM {TABLE_NAME}").strip()
    assert engine_row == expected_row

    # Through the table function.
    table_function_expr = get_creation_expression(
        storage_type, TABLE_NAME, started_cluster, table_function=True
    )
    function_row = instance.query(
        f"SELECT a, b, c, d, e FROM {table_function_expr}"
    ).strip()
    assert function_row == expected_row

    expected_schema = [
        ["a", "Nullable(Int32)"],
        ["b", "Nullable(String)"],
        ["c", "Nullable(Date)"],
        ["d", "Array(Nullable(String))"],
        ["e", "Nullable(Bool)"],
    ]
    assert instance.query(f"DESCRIBE {table_function_expr} FORMAT TSV") == TSV(
        expected_schema
    )
2023-10-15 16:02:34 +00:00
2024-11-18 17:37:24 +00:00
@pytest.mark.parametrize("format_version", ["1", "2"])
@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs"])
def test_cluster_table_function(started_cluster, format_version, storage_type):
    """Compare the ``*Cluster`` table function against the single-node one.

    One 100-row dataframe is written per cluster instance. The same SELECT is
    run through the regular and the cluster table function, and the results
    must match. Finally each node's ``system.query_log`` must contain exactly
    one secondary (non-initial) query for the cluster function.
    """
    if is_arm() and storage_type == "hdfs":
        pytest.skip("Disabled test IcebergHDFS for aarch64")

    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    TABLE_NAME = (
        f"test_iceberg_cluster_{format_version}_{storage_type}_{get_uuid_str()}"
    )

    def add_df(mode):
        # Write another 100 rows and re-upload the whole table directory.
        write_iceberg_from_df(
            spark,
            generate_data(spark, 0, 100),
            TABLE_NAME,
            mode=mode,
            format_version=format_version,
        )

        files = default_upload_directory(
            started_cluster,
            storage_type,
            f"/iceberg_data/default/{TABLE_NAME}/",
            f"/iceberg_data/default/{TABLE_NAME}/",
        )

        logging.info(f"Adding another dataframe. result files: {files}")

        return files

    instance_count = len(started_cluster.instances)

    files = add_df(mode="overwrite")
    for _ in range(1, instance_count):
        files = add_df(mode="append")

    logging.info(f"Setup complete. files: {files}")
    assert len(files) == 5 + 4 * (instance_count - 1)

    clusters = instance.query("SELECT * FROM system.clusters")
    logging.info(f"Clusters setup: {clusters}")

    # Regular Query only node1
    table_function_expr = get_creation_expression(
        storage_type, TABLE_NAME, started_cluster, table_function=True
    )
    select_regular = (
        instance.query(f"SELECT * FROM {table_function_expr}").strip().split()
    )

    # Cluster Query with node1 as coordinator
    table_function_expr_cluster = get_creation_expression(
        storage_type,
        TABLE_NAME,
        started_cluster,
        table_function=True,
        run_on_cluster=True,
    )
    select_cluster = (
        instance.query(f"SELECT * FROM {table_function_expr_cluster}").strip().split()
    )

    # Simple size check (whitespace-split values of the two-column result)
    assert len(select_regular) == 600
    assert len(select_cluster) == 600

    # Actual check
    assert select_cluster == select_regular

    # Check query_log
    for replica in started_cluster.instances.values():
        replica.query("SYSTEM FLUSH LOGS")

    for node_name, replica in started_cluster.instances.items():
        query_log_output = replica.query(
            f"""
            SELECT query, type, is_initial_query, read_rows, read_bytes FROM system.query_log
            WHERE
                type = 'QueryStart' AND
                positionCaseInsensitive(query, '{storage_type}Cluster') != 0 AND
                position(query, '{TABLE_NAME}') != 0 AND
                position(query, 'system.query_log') = 0 AND
                NOT is_initial_query
            """
        ).strip()

        # "".split("\n") yields [""], which used to make the length check pass
        # even when a node logged no secondary query at all; drop empty lines.
        cluster_secondary_queries = [
            row for row in query_log_output.split("\n") if row
        ]

        logging.info(
            f"[{node_name}] cluster_secondary_queries: {cluster_secondary_queries}"
        )
        assert len(cluster_secondary_queries) == 1
2023-10-15 16:02:34 +00:00
@pytest.mark.parametrize("format_version", ["1", "2"])
@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs", "local"])
def test_delete_files(started_cluster, format_version, storage_type):
    """Check that Spark ``DELETE FROM`` is reflected in ClickHouse row counts.

    Steps: write 100 rows, delete them all (0 rows, and the server log must
    mention delete-file processing), add 100 more rows (100 rows), then
    delete ``a >= 150`` (50 rows).
    """
    if is_arm() and storage_type == "hdfs":
        pytest.skip("Disabled test IcebergHDFS for aarch64")

    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    TABLE_NAME = f"test_delete_files_{format_version}_{storage_type}_{get_uuid_str()}"

    def upload(remote_path):
        # Re-upload the table directory after each change made through Spark.
        default_upload_directory(
            started_cluster,
            storage_type,
            f"/iceberg_data/default/{TABLE_NAME}/",
            remote_path,
        )

    def row_count():
        return int(instance.query(f"SELECT count() FROM {TABLE_NAME}"))

    write_iceberg_from_df(
        spark,
        generate_data(spark, 0, 100),
        TABLE_NAME,
        mode="overwrite",
        format_version=format_version,
    )
    upload(f"/iceberg_data/default/{TABLE_NAME}/")

    create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)
    assert row_count() == 100

    spark.sql(f"DELETE FROM {TABLE_NAME} WHERE a >= 0")
    upload("")

    assert row_count() == 0
    assert instance.contains_in_log("Processing delete file for path")

    # The writer treats every mode other than "overwrite" as an append, so
    # say "append" explicitly (this used to pass the misleading "upsert").
    write_iceberg_from_df(
        spark,
        generate_data(spark, 100, 200),
        TABLE_NAME,
        mode="append",
        format_version=format_version,
    )
    upload("")

    assert row_count() == 100

    spark.sql(f"DELETE FROM {TABLE_NAME} WHERE a >= 150")
    upload("")

    assert row_count() == 50
2023-10-17 18:10:47 +00:00
@pytest.mark.parametrize("format_version", ["1", "2"])
@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs", "local"])
def test_evolved_schema(started_cluster, format_version, storage_type):
    """Check how reads behave after a column is added through Spark.

    After ``ALTER TABLE ... ADD COLUMNS`` a plain SELECT must fail with
    ``UNSUPPORTED_METHOD``. With ``iceberg_engine_ignore_schema_evolution=1``
    it must return the data as it was before the schema change.
    """
    if is_arm() and storage_type == "hdfs":
        pytest.skip("Disabled test IcebergHDFS for aarch64")

    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    TABLE_NAME = (
        f"test_evolved_schema_{format_version}_{storage_type}_{get_uuid_str()}"
    )

    write_iceberg_from_df(
        spark,
        generate_data(spark, 0, 100),
        TABLE_NAME,
        mode="overwrite",
        format_version=format_version,
    )

    default_upload_directory(
        started_cluster,
        storage_type,
        f"/iceberg_data/default/{TABLE_NAME}/",
        f"/iceberg_data/default/{TABLE_NAME}/",
    )

    create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)

    assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100

    # Snapshot of the data before the schema changes.
    expected_data = instance.query(f"SELECT * FROM {TABLE_NAME} order by a, b")

    spark.sql(f"ALTER TABLE {TABLE_NAME} ADD COLUMNS (x bigint)")
    default_upload_directory(
        started_cluster,
        storage_type,
        f"/iceberg_data/default/{TABLE_NAME}/",
        "",
    )

    error = instance.query_and_get_error(f"SELECT * FROM {TABLE_NAME}")
    assert "UNSUPPORTED_METHOD" in error

    data = instance.query(
        f"SELECT * FROM {TABLE_NAME} SETTINGS iceberg_engine_ignore_schema_evolution=1"
    )
    assert data == expected_data
2023-10-18 11:09:39 +00:00
2024-10-02 11:15:16 +00:00
@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs", "local"])
def test_row_based_deletes(started_cluster, storage_type):
    """Check that a merge-on-read delete makes the table unreadable.

    The table is created as format-version 2 with merge-on-read
    update/delete/merge modes. After ``DELETE ... WHERE id < 10`` a SELECT
    must fail with ``UNSUPPORTED_METHOD``.
    """
    if is_arm() and storage_type == "hdfs":
        pytest.skip("Disabled test IcebergHDFS for aarch64")

    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    TABLE_NAME = "test_row_based_deletes_" + storage_type + "_" + get_uuid_str()

    spark.sql(
        f"CREATE TABLE {TABLE_NAME} (id bigint, data string) USING iceberg TBLPROPERTIES ('format-version' = '2', 'write.update.mode'='merge-on-read', 'write.delete.mode'='merge-on-read', 'write.merge.mode'='merge-on-read')"
    )
    spark.sql(
        f"INSERT INTO {TABLE_NAME} select id, char(id + ascii('a')) from range(100)"
    )

    default_upload_directory(
        started_cluster,
        storage_type,
        f"/iceberg_data/default/{TABLE_NAME}/",
        f"/iceberg_data/default/{TABLE_NAME}/",
    )

    create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)

    assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100

    spark.sql(f"DELETE FROM {TABLE_NAME} WHERE id < 10")
    default_upload_directory(
        started_cluster,
        storage_type,
        f"/iceberg_data/default/{TABLE_NAME}/",
        "",
    )

    error = instance.query_and_get_error(f"SELECT * FROM {TABLE_NAME}")
    assert "UNSUPPORTED_METHOD" in error
2023-10-18 11:09:39 +00:00
2023-10-17 18:10:47 +00:00
@pytest.mark.parametrize("format_version", ["1", "2"])
@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs", "local"])
def test_schema_inference(started_cluster, format_version, storage_type):
    """Check the ClickHouse schema inferred for a wide Iceberg table.

    For each of the Parquet, ORC and Avro data formats a table covering
    scalar, decimal, date/time, binary, array, map and struct columns is
    created through Spark. ``DESC`` output is compared against the expected
    ClickHouse types, and a full SELECT is run to make sure the data parses.
    """
    if is_arm() and storage_type == "hdfs":
        pytest.skip("Disabled test IcebergHDFS for aarch64")

    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session

    expected_columns = [
        ["intC", "Nullable(Int32)"],
        ["longC", "Nullable(Int64)"],
        ["floatC", "Nullable(Float32)"],
        ["doubleC", "Nullable(Float64)"],
        ["decimalC1", "Nullable(Decimal(10, 3))"],
        ["decimalC2", "Nullable(Decimal(20, 10))"],
        ["decimalC3", "Nullable(Decimal(38, 30))"],
        ["dateC", "Nullable(Date)"],
        ["timestampC", "Nullable(DateTime64(6, 'UTC'))"],
        ["stringC", "Nullable(String)"],
        ["binaryC", "Nullable(String)"],
        ["arrayC1", "Array(Nullable(Int32))"],
        ["mapC1", "Map(String, Nullable(String))"],
        ["structC1", "Tuple(field1 Nullable(Int32), field2 Nullable(String))"],
        [
            "complexC",
            "Array(Tuple(field1 Map(String, Array(Map(String, Nullable(Int32)))), field2 Tuple(field3 Nullable(Int32), field4 Nullable(String))))",
        ],
    ]

    for file_format in ("Parquet", "ORC", "Avro"):
        TABLE_NAME = f"test_schema_inference_{file_format}_{format_version}_{storage_type}_{get_uuid_str()}"
        table_dir = f"/iceberg_data/default/{TABLE_NAME}/"

        # Types time, timestamptz, fixed are not supported in Spark.
        spark.sql(
            f"CREATE TABLE {TABLE_NAME} (intC int, longC long, floatC float, doubleC double, decimalC1 decimal(10, 3), decimalC2 decimal(20, 10), decimalC3 decimal(38, 30), dateC date, timestampC timestamp, stringC string, binaryC binary, arrayC1 array<int>, mapC1 map<string, string>, structC1 struct<field1: int, field2: string>, complexC array<struct<field1: map<string, array<map<string, int>>>, field2: struct<field3: int, field4: string>>>) USING iceberg TBLPROPERTIES ('format-version' = '{format_version}', 'write.format.default' = '{file_format}')"
        )
        spark.sql(
            f"insert into {TABLE_NAME} select 42, 4242, 42.42, 4242.4242, decimal(42.42), decimal(42.42), decimal(42.42), date('2020-01-01'), timestamp('2020-01-01 20:00:00'), 'hello', binary('hello'), array(1,2,3), map('key', 'value'), struct(42, 'hello'), array(struct(map('key', array(map('key', 42))), struct(42, 'hello')))"
        )

        files = default_upload_directory(
            started_cluster, storage_type, table_dir, table_dir
        )

        create_iceberg_table(
            storage_type, instance, TABLE_NAME, started_cluster, format=file_format
        )

        described = instance.query(
            f"DESC {TABLE_NAME} FORMAT TSVRaw", settings={"print_pretty_type_names": 0}
        )
        expected = TSV(expected_columns)
        assert described == expected

        # Check that we can parse data
        instance.query(f"SELECT * FROM {TABLE_NAME}")
@pytest.mark.parametrize("format_version", ["1", "2"])
@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs", "local"])
def test_metadata_file_selection(started_cluster, format_version, storage_type):
    """Read a table after 50 separate 10-row inserts and expect 500 rows.

    NOTE(review): ``format_version`` only ends up in the table name; the
    table itself is always created with ``'format-version' = '2'`` - confirm
    whether that is intended.
    """
    if is_arm() and storage_type == "hdfs":
        pytest.skip("Disabled test IcebergHDFS for aarch64")

    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    TABLE_NAME = (
        f"test_metadata_selection_{format_version}_{storage_type}_{get_uuid_str()}"
    )
    table_dir = f"/iceberg_data/default/{TABLE_NAME}/"

    spark.sql(
        f"CREATE TABLE {TABLE_NAME} (id bigint, data string) USING iceberg TBLPROPERTIES ('format-version' = '2', 'write.update.mode'='merge-on-read', 'write.delete.mode'='merge-on-read', 'write.merge.mode'='merge-on-read')"
    )

    # One INSERT statement per iteration, 10 rows each.
    insert_batch = (
        f"INSERT INTO {TABLE_NAME} select id, char(id + ascii('a')) from range(10)"
    )
    for _ in range(50):
        spark.sql(insert_batch)

    files = default_upload_directory(
        started_cluster, storage_type, table_dir, table_dir
    )

    create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)

    total_rows = int(instance.query(f"SELECT count() FROM {TABLE_NAME}"))
    assert total_rows == 500
@pytest.mark.parametrize("format_version", ["1", "2"])
@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs", "local"])
def test_metadata_file_format_with_uuid(started_cluster, format_version, storage_type):
    """Check that metadata files named ``<sequence>-<uuid>.metadata.json`` are read.

    Spark creates an Iceberg table and runs 50 independent inserts of 10 rows
    each. The metadata files ``v1`` .. ``v50`` are then renamed on the local
    filesystem to a zero-padded sequence number plus a random UUID before the
    table directory is uploaded to ``storage_type``. The ClickHouse table
    created on top of it must still report all 500 rows.

    The HDFS variant is skipped on ARM.
    """
    if is_arm() and storage_type == "hdfs":
        pytest.skip("Disabled test IcebergHDFS for aarch64")

    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    TABLE_NAME = (
        "test_metadata_selection_with_uuid_"
        + format_version
        + "_"
        + storage_type
        + "_"
        + get_uuid_str()
    )

    # Create the table with the parametrized format version so that the "1"
    # and "2" runs exercise different table formats. The merge-on-read
    # settings are plain table properties here: this test only inserts rows.
    spark.sql(
        f"CREATE TABLE {TABLE_NAME} (id bigint, data string) USING iceberg TBLPROPERTIES ('format-version' = '{format_version}', 'write.update.mode'='merge-on-read', 'write.delete.mode'='merge-on-read', 'write.merge.mode'='merge-on-read')"
    )

    # One INSERT statement per iteration, i.e. 50 separate commits.
    for _ in range(50):
        spark.sql(
            f"INSERT INTO {TABLE_NAME} select id, char(id + ascii('a')) from range(10)"
        )

    # Rename v1..v50 to 00000..00049 followed by a fresh UUID. Note the
    # sequence number in the new name is one less than the original version.
    metadata_dir = f"/iceberg_data/default/{TABLE_NAME}/metadata"
    for i in range(50):
        os.rename(
            f"{metadata_dir}/v{i + 1}.metadata.json",
            f"{metadata_dir}/{str(i).zfill(5)}-{get_uuid_str()}.metadata.json",
        )

    default_upload_directory(
        started_cluster,
        storage_type,
        f"/iceberg_data/default/{TABLE_NAME}/",
        f"/iceberg_data/default/{TABLE_NAME}/",
    )

    create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)

    # 50 inserts * 10 rows each.
    assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 500
2024-01-22 17:24:48 +00:00
2024-08-12 12:15:54 +00:00
def test_restart_broken_s3(started_cluster):
    """Check that an S3 Iceberg table tolerates its bucket vanishing across a restart.

    Steps:
      1. Write 100 rows with Spark, upload them to a dedicated bucket and
         verify that ClickHouse reads them.
      2. Empty and remove the bucket, then restart ClickHouse.
      3. Verify that querying the table fails with ``NoSuchBucket``.
      4. Recreate the bucket, upload the data again and verify that the same
         table returns 100 rows again.
    """
    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    TABLE_NAME = "test_restart_broken_table_function_s3" + "_" + get_uuid_str()

    minio_client = started_cluster.minio_client
    bucket = "broken2"
    if not minio_client.bucket_exists(bucket):
        minio_client.make_bucket(bucket)

    write_iceberg_from_df(
        spark,
        generate_data(spark, 0, 100),
        TABLE_NAME,
        mode="overwrite",
        format_version="1",
    )

    default_upload_directory(
        started_cluster,
        "s3",
        f"/iceberg_data/default/{TABLE_NAME}/",
        f"/iceberg_data/default/{TABLE_NAME}/",
        bucket=bucket,
    )
    create_iceberg_table("s3", instance, TABLE_NAME, started_cluster, bucket=bucket)
    assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100

    # Empty the bucket so it can be removed. The deletion is performed as a
    # statement of its own (not inside the assert expression) and the
    # iterable returned by remove_objects is fully consumed into a list, so
    # any reported errors are available for the failure message.
    s3_objects = list_s3_objects(minio_client, bucket, prefix="")
    delete_errors = list(
        minio_client.remove_objects(
            bucket,
            [DeleteObject(obj) for obj in s3_objects],
        )
    )
    assert not delete_errors, f"Failed to delete objects from {bucket}: {delete_errors}"
    minio_client.remove_bucket(bucket)

    instance.restart_clickhouse()

    # The table definition is still there, but its bucket is gone.
    assert "NoSuchBucket" in instance.query_and_get_error(
        f"SELECT count() FROM {TABLE_NAME}"
    )

    # Bring the bucket and the data back; the same table must work again.
    minio_client.make_bucket(bucket)
    default_upload_directory(
        started_cluster,
        "s3",
        f"/iceberg_data/default/{TABLE_NAME}/",
        f"/iceberg_data/default/{TABLE_NAME}/",
        bucket=bucket,
    )
    assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100
2024-10-03 15:42:50 +00:00
@pytest.mark.parametrize("storage_type", ["s3"])
def test_filesystem_cache(started_cluster, storage_type):
    """Check that a repeated read of an Iceberg table is served from ``cache1``.

    The same ``SELECT *`` is executed twice with
    ``filesystem_cache_name = 'cache1'``. Profile events taken from
    ``system.query_log`` must show that:

    * the first run issued S3 GET requests;
    * the second run read from the cache exactly as many bytes as the first
      run wrote into it, and issued no S3 GET requests.
    """
    node = started_cluster.instances["node1"]
    spark_session = started_cluster.spark_session
    TABLE_NAME = "test_filesystem_cache_" + storage_type + "_" + get_uuid_str()
    table_dir = f"/iceberg_data/default/{TABLE_NAME}/"

    write_iceberg_from_df(
        spark_session,
        generate_data(spark_session, 0, 10),
        TABLE_NAME,
        mode="overwrite",
        format_version="1",
        partition_by="a",
    )
    default_upload_directory(started_cluster, storage_type, table_dir, table_dir)
    create_iceberg_table(storage_type, node, TABLE_NAME, started_cluster)

    def run_cached_select():
        """Run the cached SELECT under a fresh query id, flush logs, return the id."""
        qid = f"{TABLE_NAME}-{uuid.uuid4()}"
        node.query(
            f"SELECT * FROM {TABLE_NAME} SETTINGS filesystem_cache_name = 'cache1'",
            query_id=qid,
        )
        node.query("SYSTEM FLUSH LOGS")
        return qid

    def profile_event(qid, event):
        """Return the integer value of ``event`` for the finished query ``qid``."""
        return int(
            node.query(
                f"SELECT ProfileEvents['{event}'] FROM system.query_log WHERE query_id = '{qid}' AND type = 'QueryFinish'"
            )
        )

    # First run: data comes from S3 and is written into the cache.
    first_run = run_cached_select()
    bytes_written = profile_event(first_run, "CachedReadBufferCacheWriteBytes")
    assert 0 < profile_event(first_run, "S3GetObject")

    # Second run: the same bytes come back from the cache, without S3 GETs.
    second_run = run_cached_select()
    assert bytes_written == profile_event(
        second_run, "CachedReadBufferReadFromCacheBytes"
    )
    assert 0 == profile_event(second_run, "S3GetObject")