add test for iceberg

Mikhail Artemenko 2024-11-18 17:37:24 +00:00
parent d2efae7511
commit a29ded4941
3 changed files with 181 additions and 20 deletions

configs/config.d/cluster.xml (new file)

@@ -0,0 +1,20 @@
<clickhouse>
    <remote_servers>
        <cluster_simple>
            <shard>
                <replica>
                    <host>node1</host>
                    <port>9000</port>
                </replica>
                <replica>
                    <host>node2</host>
                    <port>9000</port>
                </replica>
                <replica>
                    <host>node3</host>
                    <port>9000</port>
                </replica>
            </shard>
        </cluster_simple>
    </remote_servers>
</clickhouse>
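
Note: the cluster name cluster_simple defined above is exactly what the new icebergS3Cluster / icebergAzureCluster / icebergHDFSCluster table functions in this commit take as their first argument. A minimal usage sketch, assuming the integration-test fixtures from the test file below; the table name, bucket, and helper name are illustrative placeholders, not part of this commit:

    # Sketch only: count rows by fanning the read out over 'cluster_simple'.
    # 'instance' is a ClickHouse node and 'bucket' a MinIO bucket, both from
    # the test fixtures; 'my_table' and this helper are placeholders.
    def count_rows_on_cluster(instance, bucket, table_name="my_table"):
        return int(
            instance.query(
                f"SELECT count() FROM icebergS3Cluster('cluster_simple', s3, "
                f"filename = 'iceberg_data/default/{table_name}/', "
                f"format=Parquet, url = 'http://minio1:9001/{bucket}/')"
            )
        )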

configs/config.d/query_log.xml (new file)

@@ -0,0 +1,6 @@
<clickhouse>
    <query_log>
        <database>system</database>
        <table>query_log</table>
    </query_log>
</clickhouse>
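
This query_log section matters because the new test asserts, on every node, that exactly one secondary (non-initial) *Cluster query was recorded. A condensed sketch of that check, simplified from the full predicate the test uses below; the helper name is a placeholder:

    # Sketch only: count secondary-query log entries for a table on one node.
    # 'replica' is a cluster instance from the test fixtures.
    def secondary_query_count(replica, table_name):
        replica.query("SYSTEM FLUSH LOGS")  # flush in-memory log entries first
        return int(
            replica.query(
                f"SELECT count() FROM system.query_log "
                f"WHERE type = 'QueryStart' AND NOT is_initial_query "
                f"AND position(query, '{table_name}') != 0"
            )
        )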

tests/integration/test_storage_iceberg/test.py

@@ -73,14 +73,38 @@ def started_cluster():
         cluster.add_instance(
             "node1",
             main_configs=[
+                "configs/config.d/query_log.xml",
+                "configs/config.d/cluster.xml",
                 "configs/config.d/named_collections.xml",
                 "configs/config.d/filesystem_caches.xml",
             ],
             user_configs=["configs/users.d/users.xml"],
             with_minio=True,
             with_azurite=True,
-            stay_alive=True,
             with_hdfs=with_hdfs,
+            stay_alive=True,
+        )
+        cluster.add_instance(
+            "node2",
+            main_configs=[
+                "configs/config.d/query_log.xml",
+                "configs/config.d/cluster.xml",
+                "configs/config.d/named_collections.xml",
+                "configs/config.d/filesystem_caches.xml",
+            ],
+            user_configs=["configs/users.d/users.xml"],
+            stay_alive=True,
+        )
+        cluster.add_instance(
+            "node3",
+            main_configs=[
+                "configs/config.d/query_log.xml",
+                "configs/config.d/cluster.xml",
+                "configs/config.d/named_collections.xml",
+                "configs/config.d/filesystem_caches.xml",
+            ],
+            user_configs=["configs/users.d/users.xml"],
+            stay_alive=True,
         )

         logging.info("Starting cluster...")
@@ -182,6 +206,7 @@ def get_creation_expression(
     cluster,
     format="Parquet",
     table_function=False,
+    run_on_cluster=False,
     **kwargs,
 ):
     if storage_type == "s3":
@@ -189,35 +214,56 @@ def get_creation_expression(
             bucket = kwargs["bucket"]
         else:
             bucket = cluster.minio_bucket
-        print(bucket)
-        if table_function:
-            return f"icebergS3(s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"
+
+        if run_on_cluster:
+            assert table_function
+            return f"icebergS3Cluster('cluster_simple', s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"
         else:
-            return f"""
-                DROP TABLE IF EXISTS {table_name};
-                CREATE TABLE {table_name}
-                ENGINE=IcebergS3(s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"""
+            if table_function:
+                return f"icebergS3(s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"
+            else:
+                return f"""
+                    DROP TABLE IF EXISTS {table_name};
+                    CREATE TABLE {table_name}
+                    ENGINE=IcebergS3(s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"""
+
     elif storage_type == "azure":
-        if table_function:
+        if run_on_cluster:
+            assert table_function
             return f"""
-                icebergAzure(azure, container = '{cluster.azure_container_name}', storage_account_url = '{cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]}', blob_path = '/iceberg_data/default/{table_name}/', format={format})
+                icebergAzureCluster('cluster_simple', azure, container = '{cluster.azure_container_name}', storage_account_url = '{cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]}', blob_path = '/iceberg_data/default/{table_name}/', format={format})
             """
         else:
-            return f"""
-                DROP TABLE IF EXISTS {table_name};
-                CREATE TABLE {table_name}
-                ENGINE=IcebergAzure(azure, container = {cluster.azure_container_name}, storage_account_url = '{cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]}', blob_path = '/iceberg_data/default/{table_name}/', format={format})"""
+            if table_function:
+                return f"""
+                    icebergAzure(azure, container = '{cluster.azure_container_name}', storage_account_url = '{cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]}', blob_path = '/iceberg_data/default/{table_name}/', format={format})
+                """
+            else:
+                return f"""
+                    DROP TABLE IF EXISTS {table_name};
+                    CREATE TABLE {table_name}
+                    ENGINE=IcebergAzure(azure, container = {cluster.azure_container_name}, storage_account_url = '{cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]}', blob_path = '/iceberg_data/default/{table_name}/', format={format})"""
+
     elif storage_type == "hdfs":
-        if table_function:
+        if run_on_cluster:
+            assert table_function
             return f"""
-                icebergHDFS(hdfs, filename= 'iceberg_data/default/{table_name}/', format={format}, url = 'hdfs://hdfs1:9000/')
+                icebergHDFSCluster('cluster_simple', hdfs, filename= 'iceberg_data/default/{table_name}/', format={format}, url = 'hdfs://hdfs1:9000/')
             """
         else:
-            return f"""
-                DROP TABLE IF EXISTS {table_name};
-                CREATE TABLE {table_name}
-                ENGINE=IcebergHDFS(hdfs, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'hdfs://hdfs1:9000/');"""
+            if table_function:
+                return f"""
+                    icebergHDFS(hdfs, filename= 'iceberg_data/default/{table_name}/', format={format}, url = 'hdfs://hdfs1:9000/')
+                """
+            else:
+                return f"""
+                    DROP TABLE IF EXISTS {table_name};
+                    CREATE TABLE {table_name}
+                    ENGINE=IcebergHDFS(hdfs, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'hdfs://hdfs1:9000/');"""
+
     elif storage_type == "local":
+        assert not run_on_cluster
         if table_function:
             return f"""
                 icebergLocal(local, path = '/iceberg_data/default/{table_name}/', format={format})
@@ -227,6 +273,7 @@ def get_creation_expression(
                 DROP TABLE IF EXISTS {table_name};
                 CREATE TABLE {table_name}
                 ENGINE=IcebergLocal(local, path = '/iceberg_data/default/{table_name}/', format={format});"""
+
     else:
         raise Exception(f"Unknown iceberg storage type: {storage_type}")
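
In short: run_on_cluster=True switches the helper to the *Cluster table-function form and asserts table_function, since there is no CREATE TABLE engine counterpart for the cluster variants, and the local storage type rejects the flag outright. A sketch of the two call shapes the new test exercises; 'my_table' is a placeholder name, and started_cluster is the fixture defined earlier in this file:

    # Plain table function, executed on the coordinating node only:
    expr = get_creation_expression(
        "s3", "my_table", started_cluster, table_function=True
    )

    # Cluster variant, distributing the read over 'cluster_simple':
    expr_cluster = get_creation_expression(
        "s3", "my_table", started_cluster, table_function=True, run_on_cluster=True
    )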
@@ -492,6 +539,94 @@ def test_types(started_cluster, format_version, storage_type):
     )


+@pytest.mark.parametrize("format_version", ["1", "2"])
+@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs"])
+def test_cluster_table_function(started_cluster, format_version, storage_type):
+    if is_arm() and storage_type == "hdfs":
+        pytest.skip("Disabled test IcebergHDFS for aarch64")
+
+    instance = started_cluster.instances["node1"]
+    spark = started_cluster.spark_session
+
+    TABLE_NAME = (
+        "test_iceberg_cluster_"
+        + format_version
+        + "_"
+        + storage_type
+        + "_"
+        + get_uuid_str()
+    )
+
+    def add_df(mode):
+        write_iceberg_from_df(
+            spark,
+            generate_data(spark, 0, 100),
+            TABLE_NAME,
+            mode=mode,
+            format_version=format_version,
+        )
+
+        files = default_upload_directory(
+            started_cluster,
+            storage_type,
+            f"/iceberg_data/default/{TABLE_NAME}/",
+            f"/iceberg_data/default/{TABLE_NAME}/",
+        )
+
+        logging.info(f"Adding another dataframe. result files: {files}")
+
+        return files
+
+    files = add_df(mode="overwrite")
+    for i in range(1, len(started_cluster.instances)):
+        files = add_df(mode="append")
+
+    logging.info(f"Setup complete. files: {files}")
+    assert len(files) == 5 + 4 * (len(started_cluster.instances) - 1)
+
+    clusters = instance.query(f"SELECT * FROM system.clusters")
+    logging.info(f"Clusters setup: {clusters}")
+
+    # Regular Query only node1
+    table_function_expr = get_creation_expression(
+        storage_type, TABLE_NAME, started_cluster, table_function=True
+    )
+    select_regular = instance.query(f"SELECT * FROM {table_function_expr}").strip().split()
+
+    # Cluster Query with node1 as coordinator
+    table_function_expr_cluster = get_creation_expression(
+        storage_type, TABLE_NAME, started_cluster, table_function=True, run_on_cluster=True
+    )
+    select_cluster = instance.query(f"SELECT * FROM {table_function_expr_cluster}").strip().split()
+
+    # Simple size check
+    assert len(select_regular) == 600
+    assert len(select_cluster) == 600
+
+    # Actual check
+    assert select_cluster == select_regular
+
+    # Check query_log
+    for replica in started_cluster.instances.values():
+        replica.query("SYSTEM FLUSH LOGS")
+
+    for node_name, replica in started_cluster.instances.items():
+        cluster_secondary_queries = replica.query(
+            f"""
+            SELECT query, type, is_initial_query, read_rows, read_bytes FROM system.query_log
+            WHERE
+                type = 'QueryStart' AND
+                positionCaseInsensitive(query, '{storage_type}Cluster') != 0 AND
+                position(query, '{TABLE_NAME}') != 0 AND
+                position(query, 'system.query_log') = 0 AND
+                NOT is_initial_query
+            """
+        ).strip().split("\n")
+
+        logging.info(f"[{node_name}] cluster_secondary_queries: {cluster_secondary_queries}")
+        assert len(cluster_secondary_queries) == 1
+
+
 @pytest.mark.parametrize("format_version", ["1", "2"])
 @pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs", "local"])
 def test_delete_files(started_cluster, format_version, storage_type):