add test for iceberg

2024-12-17 20:02:05 +00:00 · 2024-11-18 17:37:24 +00:00 · 2024-11-18 17:37:24 +00:00 · a29ded4941
commit a29ded4941
parent d2efae7511
3 changed files with 181 additions and 20 deletions
--- a/tests/integration/test_storage_iceberg/configs/config.d/cluster.xml
+++ b/tests/integration/test_storage_iceberg/configs/config.d/cluster.xml
@ -0,0 +1,20 @@
+<clickhouse>
+    <remote_servers>
+        <cluster_simple>
+            <shard>
+                <replica>
+                    <host>node1</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>node2</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>node3</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </cluster_simple>
+    </remote_servers>
+</clickhouse>
--- a/tests/integration/test_storage_iceberg/configs/config.d/query_log.xml
+++ b/tests/integration/test_storage_iceberg/configs/config.d/query_log.xml
@ -0,0 +1,6 @@
+<clickhouse>
+    <query_log>
+      <database>system</database>
+      <table>query_log</table>
+    </query_log>
+</clickhouse>
--- a/tests/integration/test_storage_iceberg/test.py
+++ b/tests/integration/test_storage_iceberg/test.py
@ -73,14 +73,38 @@ def started_cluster():
        cluster.add_instance(
            "node1",
            main_configs=[
+                "configs/config.d/query_log.xml",
+                "configs/config.d/cluster.xml",
                "configs/config.d/named_collections.xml",
                "configs/config.d/filesystem_caches.xml",
            ],
            user_configs=["configs/users.d/users.xml"],
            with_minio=True,
            with_azurite=True,
-            stay_alive=True,
            with_hdfs=with_hdfs,
+            stay_alive=True,
+        )
+        cluster.add_instance(
+            "node2",
+            main_configs=[
+                "configs/config.d/query_log.xml",
+                "configs/config.d/cluster.xml",
+                "configs/config.d/named_collections.xml",
+                "configs/config.d/filesystem_caches.xml",
+            ],
+            user_configs=["configs/users.d/users.xml"],
+            stay_alive=True,
+        )
+        cluster.add_instance(
+            "node3",
+            main_configs=[
+                "configs/config.d/query_log.xml",
+                "configs/config.d/cluster.xml",
+                "configs/config.d/named_collections.xml",
+                "configs/config.d/filesystem_caches.xml",
+            ],
+            user_configs=["configs/users.d/users.xml"],
+            stay_alive=True,
        )

        logging.info("Starting cluster...")
@ -182,6 +206,7 @@ def get_creation_expression(
    cluster,
    format="Parquet",
    table_function=False,
+    run_on_cluster=False,
    **kwargs,
 ):
    if storage_type == "s3":
@ -189,35 +214,56 @@ def get_creation_expression(
            bucket = kwargs["bucket"]
        else:
            bucket = cluster.minio_bucket
-        print(bucket)
-        if table_function:
-            return f"icebergS3(s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"
+
+        if run_on_cluster:
+            assert table_function
+            return f"icebergS3Cluster('cluster_simple', s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"
        else:
-            return f"""
-                DROP TABLE IF EXISTS {table_name};
-                CREATE TABLE {table_name}
-                ENGINE=IcebergS3(s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"""
+            if table_function:
+                return f"icebergS3(s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"
+            else:
+                return f"""
+                    DROP TABLE IF EXISTS {table_name};
+                    CREATE TABLE {table_name}
+                    ENGINE=IcebergS3(s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"""
+
    elif storage_type == "azure":
-        if table_function:
+        if run_on_cluster:
+            assert table_function
            return f"""
-                icebergAzure(azure, container = '{cluster.azure_container_name}', storage_account_url = '{cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]}', blob_path = '/iceberg_data/default/{table_name}/', format={format})
+                icebergAzureCluster('cluster_simple', azure, container = '{cluster.azure_container_name}', storage_account_url = '{cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]}', blob_path = '/iceberg_data/default/{table_name}/', format={format})
            """
        else:
-            return f"""
-                DROP TABLE IF EXISTS {table_name};
-                CREATE TABLE {table_name}
-                ENGINE=IcebergAzure(azure, container = {cluster.azure_container_name}, storage_account_url = '{cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]}', blob_path = '/iceberg_data/default/{table_name}/', format={format})"""
+            if table_function:
+                return f"""
+                    icebergAzure(azure, container = '{cluster.azure_container_name}', storage_account_url = '{cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]}', blob_path = '/iceberg_data/default/{table_name}/', format={format})
+                """
+            else:
+                return f"""
+                    DROP TABLE IF EXISTS {table_name};
+                    CREATE TABLE {table_name}
+                    ENGINE=IcebergAzure(azure, container = {cluster.azure_container_name}, storage_account_url = '{cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]}', blob_path = '/iceberg_data/default/{table_name}/', format={format})"""
+
    elif storage_type == "hdfs":
-        if table_function:
+        if run_on_cluster:
+            assert table_function
            return f"""
-                icebergHDFS(hdfs, filename= 'iceberg_data/default/{table_name}/', format={format}, url = 'hdfs://hdfs1:9000/')
+                icebergHDFSCluster('cluster_simple', hdfs, filename= 'iceberg_data/default/{table_name}/', format={format}, url = 'hdfs://hdfs1:9000/')
            """
        else:
-            return f"""
-                DROP TABLE IF EXISTS {table_name};
-                CREATE TABLE {table_name}
-                ENGINE=IcebergHDFS(hdfs, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'hdfs://hdfs1:9000/');"""
+            if table_function:
+                return f"""
+                    icebergHDFS(hdfs, filename= 'iceberg_data/default/{table_name}/', format={format}, url = 'hdfs://hdfs1:9000/')
+                """
+            else:
+                return f"""
+                    DROP TABLE IF EXISTS {table_name};
+                    CREATE TABLE {table_name}
+                    ENGINE=IcebergHDFS(hdfs, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'hdfs://hdfs1:9000/');"""
+
    elif storage_type == "local":
+        assert not run_on_cluster
+
        if table_function:
            return f"""
                icebergLocal(local, path = '/iceberg_data/default/{table_name}/', format={format})
@ -227,6 +273,7 @@ def get_creation_expression(
                DROP TABLE IF EXISTS {table_name};
                CREATE TABLE {table_name}
                ENGINE=IcebergLocal(local, path = '/iceberg_data/default/{table_name}/', format={format});"""
+
    else:
        raise Exception(f"Unknown iceberg storage type: {storage_type}")

@ -492,6 +539,94 @@ def test_types(started_cluster, format_version, storage_type):
    )


+@pytest.mark.parametrize("format_version", ["1", "2"])
+@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs"])
+def test_cluster_table_function(started_cluster, format_version, storage_type):
+    if is_arm() and storage_type == "hdfs":
+        pytest.skip("Disabled test IcebergHDFS for aarch64")
+
+    instance = started_cluster.instances["node1"]
+    spark = started_cluster.spark_session
+
+    TABLE_NAME = (
+        "test_iceberg_cluster_"
+        + format_version
+        + "_"
+        + storage_type
+        + "_"
+        + get_uuid_str()
+    )
+
+    def add_df(mode):
+        write_iceberg_from_df(
+            spark,
+            generate_data(spark, 0, 100),
+            TABLE_NAME,
+            mode=mode,
+            format_version=format_version,
+        )
+
+        files = default_upload_directory(
+            started_cluster,
+            storage_type,
+            f"/iceberg_data/default/{TABLE_NAME}/",
+            f"/iceberg_data/default/{TABLE_NAME}/",
+        )
+
+        logging.info(f"Adding another dataframe. result files: {files}")
+
+        return files
+
+    files = add_df(mode="overwrite")
+    for i in range(1, len(started_cluster.instances)):
+        files = add_df(mode="append")
+
+    logging.info(f"Setup complete. files: {files}")
+    assert len(files) == 5 + 4 * (len(started_cluster.instances) - 1)
+
+    clusters = instance.query(f"SELECT * FROM system.clusters")
+    logging.info(f"Clusters setup: {clusters}")
+
+    # Regular Query only node1
+    table_function_expr = get_creation_expression(
+        storage_type, TABLE_NAME, started_cluster, table_function=True
+    )
+    select_regular = instance.query(f"SELECT * FROM {table_function_expr}").strip().split()
+
+    # Cluster Query with node1 as coordinator
+    table_function_expr_cluster = get_creation_expression(
+        storage_type, TABLE_NAME, started_cluster, table_function=True, run_on_cluster=True
+    )
+    select_cluster = instance.query(f"SELECT * FROM {table_function_expr_cluster}").strip().split()
+
+    # Simple size check
+    assert len(select_regular) == 600
+    assert len(select_cluster) == 600
+
+    # Actual check
+    assert select_cluster == select_regular
+
+    # Check query_log
+    for replica in started_cluster.instances.values():
+        replica.query("SYSTEM FLUSH LOGS")
+
+    for node_name, replica in started_cluster.instances.items():
+        cluster_secondary_queries = replica.query(
+            f"""
+                SELECT query, type, is_initial_query, read_rows, read_bytes FROM system.query_log
+                WHERE
+                    type = 'QueryStart' AND
+                    positionCaseInsensitive(query, '{storage_type}Cluster') != 0 AND
+                    position(query, '{TABLE_NAME}') != 0 AND
+                    position(query, 'system.query_log') = 0 AND
+                    NOT is_initial_query
+            """
+        ).strip().split("\n")
+
+        logging.info(f"[{node_name}] cluster_secondary_queries: {cluster_secondary_queries}")
+        assert len(cluster_secondary_queries) == 1
+
+
@pytest.mark.parametrize("format_version", ["1", "2"])
@pytest.mark.parametrize("storage_type", ["s3", "azure", "hdfs", "local"])
 def test_delete_files(started_cluster, format_version, storage_type):