# ClickHouse/tests/integration/test_storage_iceberg/test.py

import helpers.client
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV

import pyspark
import logging
import os
import json
import pytest
import time
import glob

from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DateType,
    TimestampType,
    BooleanType,
    ArrayType,
)
from pyspark.sql.functions import current_timestamp
from datetime import datetime
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window
from pyspark.sql.readwriter import DataFrameWriter, DataFrameWriterV2

from helpers.s3_tools import prepare_s3_bucket, upload_directory, get_file_contents

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))


def get_spark():
    """Return a local SparkSession configured for writing Iceberg tables.

    The session catalog (``spark_catalog``) is set to Iceberg's
    ``SparkSessionCatalog`` of type ``hadoop`` with its warehouse at
    ``/iceberg_data``; an additional ``local`` catalog class is registered
    and the Iceberg SQL session extensions are enabled.

    Returns:
        The session produced by ``getOrCreate()`` on the configured builder.
    """
    builder = (
        pyspark.sql.SparkSession.builder.appName("spark_test")
        .config(
            "spark.sql.catalog.spark_catalog",
            "org.apache.iceberg.spark.SparkSessionCatalog",
        )
        .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.catalog.spark_catalog.type", "hadoop")
        .config("spark.sql.catalog.spark_catalog.warehouse", "/iceberg_data")
        .config(
            "spark.sql.extensions",
            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
        )
        # The master only needs to be set once on the builder.
        .master("local")
    )
    return builder.getOrCreate()


@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture: start a one-node cluster with MinIO and Spark.

    Yields:
        The started ``ClickHouseCluster``. The Spark session returned by
        ``get_spark()`` is attached to it as ``cluster.spark_session``.

    The cluster is shut down once the module's tests finish, and also when
    any setup step after construction fails.
    """
    # Build the cluster object before entering ``try`` so that ``finally``
    # never touches an unbound ``cluster`` (which would replace the original
    # constructor error with a NameError).
    cluster = ClickHouseCluster(__file__, with_spark=True)
    try:
        cluster.add_instance(
            "node1",
            main_configs=["configs/config.d/named_collections.xml"],
            user_configs=["configs/users.d/users.xml"],
            with_minio=True,
        )

        logging.info("Starting cluster...")
        cluster.start()

        prepare_s3_bucket(cluster)
        logging.info("S3 bucket created")

        cluster.spark_session = get_spark()

        yield cluster

    finally:
        cluster.shutdown()


def run_query(instance, query, stdin=None, settings=None):
    # type: (ClickHouseInstance, str, object, dict) -> str
    """Run ``query`` on ``instance``, logging before and after the call.

    ``stdin`` and ``settings`` are forwarded unchanged to ``instance.query``;
    whatever that call returns is returned to the caller.
    """
    start_message = "Running query '{}'...".format(query)
    logging.info(start_message)

    output = instance.query(query, stdin=stdin, settings=settings)

    logging.info("Query finished")
    return output


def write_iceberg_from_file(
    spark, path, table_name, mode="overwrite", format_version="1", partition_by=None
):
    """Load the local file at ``path`` with Spark and write it to ``table_name``.

    With ``mode == "overwrite"`` the table is created via ``create()`` as an
    Iceberg table carrying the given ``format-version`` property, partitioned
    by ``partition_by`` when one is given. Any other ``mode`` appends to the
    table, and ``format_version`` / ``partition_by`` are ignored.
    """
    writer = spark.read.load(f"file://{path}").writeTo(table_name)

    if mode != "overwrite":
        writer.append()
        return

    # Partitioning (if requested) is declared before the table property.
    if partition_by is not None:
        writer = writer.partitionedBy(partition_by)

    writer.tableProperty("format-version", format_version).using("iceberg").create()


def write_iceberg_from_df(
    spark, df, table_name, mode="overwrite", format_version="1", partition_by=None
):
    """Write the DataFrame ``df`` to the table ``table_name``.

    With ``mode == "overwrite"`` the table is created via ``create()`` as an
    Iceberg table carrying the given ``format-version`` property, partitioned
    by ``partition_by`` when one is given. Any other ``mode`` appends to the
    table, and ``format_version`` / ``partition_by`` are ignored.

    ``spark`` is accepted for signature symmetry with
    ``write_iceberg_from_file`` but is not used here.
    """
    writer = df.writeTo(table_name)

    if mode != "overwrite":
        writer.append()
        return

    # The table property is set first, then the optional partitioning.
    writer = writer.tableProperty("format-version", format_version)
    if partition_by is not None:
        writer = writer.partitionedBy(partition_by)

    writer.using("iceberg").create()


def generate_data(spark, start, end):
    """Build a two-column DataFrame covering ``range(start, end)``.

    Column ``a`` holds the values ``start .. end - 1``; column ``b`` holds
    ``start + 1 .. end`` cast to string. The two ranges are paired up by
    joining on a temporary ``row_index`` column, which is dropped afterwards.
    """

    def with_row_index(frame):
        # row_number() over a window ordered by monotonically_increasing_id()
        # provides the key that both frames are joined on.
        ordering = Window.orderBy(monotonically_increasing_id())
        return frame.withColumn("row_index", row_number().over(ordering))

    numbers = spark.range(start, end, 1).toDF("a")
    shifted = spark.range(start + 1, end + 1, 1).toDF("b")
    shifted = shifted.withColumn("b", shifted["b"].cast(StringType()))

    paired = with_row_index(numbers).join(with_row_index(shifted), on=["row_index"])
    return paired.drop("row_index")


def create_iceberg_table(node, table_name, format="Parquet"):
    """(Re)create the ClickHouse table ``table_name`` with the Iceberg engine.

    Sends a single multi-statement query to ``node``: drop the table if it
    exists, then create it pointing at ``iceberg_data/default/<table_name>/``
    through the ``s3`` named collection with the given data ``format``.
    """
    ddl = f"""
        DROP TABLE IF EXISTS {table_name};
        CREATE TABLE {table_name}
        ENGINE=Iceberg(s3, filename = 'iceberg_data/default/{table_name}/', format={format})"""
    node.query(ddl)


def create_initial_data_file(
    cluster, node, query, table_name, compression_method="none"
):
    """Have ``node`` write the result of ``query`` to ``<table_name>.parquet``.

    The INSERT targets the ``file()`` table function with the requested
    Parquet ``compression_method``.

    Returns:
        The path ``<SCRIPT_DIR>/<instances_dir_name>/node1/database/user_files/
        <table_name>.parquet``. The ``node1`` component is fixed and does not
        depend on the ``node`` argument.
    """
    # NOTE(review): the statement sets s3_truncate_on_insert although the
    # target is file(); confirm whether engine_file_truncate_on_insert was
    # intended.
    insert_statement = f"""
        INSERT INTO TABLE FUNCTION
            file('{table_name}.parquet')
        SETTINGS
            output_format_parquet_compression_method='{compression_method}',
            s3_truncate_on_insert=1 {query}
        FORMAT Parquet"""
    node.query(insert_statement)

    user_files_dir = os.path.join(
        SCRIPT_DIR, f"{cluster.instances_dir_name}/node1/database/user_files"
    )
    return f"{user_files_dir}/{table_name}.parquet"


@pytest.mark.parametrize("format_version", ["1", "2"])
def test_single_iceberg_file(started_cluster, format_version):
    """An Iceberg table built from one Parquet file returns the source rows."""
    node = started_cluster.instances["node1"]
    session = started_cluster.spark_session
    minio = started_cluster.minio_client
    bucket_name = started_cluster.minio_bucket
    table_name = "test_single_iceberg_file_" + format_version

    source_query = "SELECT number, toString(number) as string FROM numbers(100)"

    # ClickHouse produces the Parquet file; Spark turns it into an Iceberg table.
    parquet_path = create_initial_data_file(
        started_cluster, node, source_query, table_name
    )
    write_iceberg_from_file(
        session, parquet_path, table_name, format_version=format_version
    )

    upload_directory(minio, bucket_name, f"/iceberg_data/default/{table_name}/", "")

    create_iceberg_table(node, table_name)
    actual = node.query(f"SELECT * FROM {table_name}")
    expected = node.query(source_query)
    assert actual == expected


@pytest.mark.parametrize("format_version", ["1", "2"])
def test_partition_by(started_cluster, format_version):
    """A table partitioned by column ``a`` is uploaded and read back in full."""
    node = started_cluster.instances["node1"]
    session = started_cluster.spark_session
    minio = started_cluster.minio_client
    bucket_name = started_cluster.minio_bucket
    table_name = "test_partition_by_" + format_version

    rows = generate_data(session, 0, 10)
    write_iceberg_from_df(
        session,
        rows,
        table_name,
        mode="overwrite",
        format_version=format_version,
        partition_by="a",
    )

    uploaded = upload_directory(
        minio, bucket_name, f"/iceberg_data/default/{table_name}/", ""
    )
    # 10 partitions + 4 metadata files
    assert len(uploaded) == 14

    create_iceberg_table(node, table_name)
    row_count = int(node.query(f"SELECT count() FROM {table_name}"))
    assert row_count == 10


@pytest.mark.parametrize("format_version", ["1", "2"])
def test_multiple_iceberg_files(started_cluster, format_version):
    """Appending a second batch is visible through the existing ClickHouse table."""
    node = started_cluster.instances["node1"]
    session = started_cluster.spark_session
    minio = started_cluster.minio_client
    bucket_name = started_cluster.minio_bucket
    table_name = "test_multiple_iceberg_files_" + format_version
    table_dir = f"/iceberg_data/default/{table_name}"

    def row_count():
        return int(node.query(f"SELECT count() FROM {table_name}"))

    first_batch = generate_data(session, 0, 100)
    write_iceberg_from_df(
        session,
        first_batch,
        table_name,
        mode="overwrite",
        format_version=format_version,
    )

    uploaded = upload_directory(minio, bucket_name, table_dir, "")
    # ['/iceberg_data/default/test_multiple_iceberg_files/data/00000-1-35302d56-f1ed-494e-a85b-fbf85c05ab39-00001.parquet',
    # '/iceberg_data/default/test_multiple_iceberg_files/metadata/version-hint.text',
    # '/iceberg_data/default/test_multiple_iceberg_files/metadata/3127466b-299d-48ca-a367-6b9b1df1e78c-m0.avro',
    # '/iceberg_data/default/test_multiple_iceberg_files/metadata/snap-5220855582621066285-1-3127466b-299d-48ca-a367-6b9b1df1e78c.avro',
    # '/iceberg_data/default/test_multiple_iceberg_files/metadata/v1.metadata.json']
    assert len(uploaded) == 5

    create_iceberg_table(node, table_name)
    assert row_count() == 100

    second_batch = generate_data(session, 100, 200)
    write_iceberg_from_df(
        session,
        second_batch,
        table_name,
        mode="append",
        format_version=format_version,
    )
    uploaded = upload_directory(minio, bucket_name, table_dir, "")
    assert len(uploaded) == 9

    assert row_count() == 200
    actual = node.query(f"SELECT * FROM {table_name} ORDER BY 1")
    expected = node.query("SELECT number, toString(number + 1) FROM numbers(200)")
    assert actual == expected


@pytest.mark.parametrize("format_version", ["1", "2"])
def test_types(started_cluster, format_version):
    """Int, string, date, array and bool columns round-trip with the expected types."""
    node = started_cluster.instances["node1"]
    session = started_cluster.spark_session
    minio = started_cluster.minio_client
    bucket_name = started_cluster.minio_bucket
    table_name = "test_types_" + format_version

    fields = [
        StructField("a", IntegerType()),
        StructField("b", StringType()),
        StructField("c", DateType()),
        StructField("d", ArrayType(StringType())),
        StructField("e", BooleanType()),
    ]
    single_row = (
        123,
        "string",
        datetime.strptime("2000-01-01", "%Y-%m-%d"),
        ["str1", "str2"],
        True,
    )
    frame = session.createDataFrame(data=[single_row], schema=StructType(fields))
    frame.printSchema()
    write_iceberg_from_df(
        session, frame, table_name, mode="overwrite", format_version=format_version
    )

    upload_directory(minio, bucket_name, f"/iceberg_data/default/{table_name}", "")

    create_iceberg_table(node, table_name)
    assert int(node.query(f"SELECT count() FROM {table_name}")) == 1

    # The same row is expected through the table and through the table function.
    expected_row = "123\tstring\t2000-01-01\t['str1','str2']\ttrue"
    via_table = node.query(f"SELECT a, b, c, d, e FROM {table_name}").strip()
    assert via_table == expected_row

    table_function = f"iceberg(s3, filename='iceberg_data/default/{table_name}/')"
    via_function = node.query(f"SELECT a, b, c, d, e FROM {table_function}").strip()
    assert via_function == expected_row

    expected_schema = TSV(
        [
            ["a", "Nullable(Int32)"],
            ["b", "Nullable(String)"],
            ["c", "Nullable(Date)"],
            ["d", "Array(Nullable(String))"],
            ["e", "Nullable(Bool)"],
        ]
    )
    assert node.query(f"DESCRIBE {table_function} FORMAT TSV") == expected_schema


@pytest.mark.parametrize("format_version", ["1", "2"])
def test_delete_files(started_cluster, format_version):
    """Row counts follow Spark-side deletes and appends after each re-upload.

    Sequence: write 100 rows, delete all of them, append 100 new rows, then
    delete the upper half. After each step the table directory is uploaded
    again and the count seen by ClickHouse is checked.
    """
    instance = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    minio_client = started_cluster.minio_client
    bucket = started_cluster.minio_bucket
    TABLE_NAME = "test_delete_files_" + format_version
    table_dir = f"/iceberg_data/default/{TABLE_NAME}/"

    def row_count():
        return int(instance.query(f"SELECT count() FROM {TABLE_NAME}"))

    write_iceberg_from_df(
        spark,
        generate_data(spark, 0, 100),
        TABLE_NAME,
        mode="overwrite",
        format_version=format_version,
    )
    upload_directory(minio_client, bucket, table_dir, "")

    create_iceberg_table(instance, TABLE_NAME)
    assert row_count() == 100

    spark.sql(f"DELETE FROM {TABLE_NAME} WHERE a >= 0")
    upload_directory(minio_client, bucket, table_dir, "")

    assert row_count() == 0
    assert instance.contains_in_log("Processing delete file for path")

    # The helper only distinguishes "overwrite" from everything else, and
    # everything else appends; name the mode that is actually performed.
    write_iceberg_from_df(
        spark,
        generate_data(spark, 100, 200),
        TABLE_NAME,
        mode="append",
        format_version=format_version,
    )
    upload_directory(minio_client, bucket, table_dir, "")

    assert row_count() == 100

    spark.sql(f"DELETE FROM {TABLE_NAME} WHERE a >= 150")
    upload_directory(minio_client, bucket, table_dir, "")

    assert row_count() == 50


@pytest.mark.parametrize("format_version", ["1", "2"])
def test_evolved_schema(started_cluster, format_version):
    """Selecting from a table after a Spark-side ADD COLUMNS reports UNSUPPORTED_METHOD."""
    node = started_cluster.instances["node1"]
    session = started_cluster.spark_session
    minio = started_cluster.minio_client
    bucket_name = started_cluster.minio_bucket
    table_name = "test_evolved_schema_" + format_version
    table_dir = f"/iceberg_data/default/{table_name}/"

    initial_rows = generate_data(session, 0, 100)
    write_iceberg_from_df(
        session,
        initial_rows,
        table_name,
        mode="overwrite",
        format_version=format_version,
    )
    upload_directory(minio, bucket_name, table_dir, "")

    create_iceberg_table(node, table_name)
    row_count = int(node.query(f"SELECT count() FROM {table_name}"))
    assert row_count == 100

    # Change the schema in Spark, then upload the table directory again.
    session.sql(f"ALTER TABLE {table_name} ADD COLUMNS (x bigint)")
    upload_directory(minio, bucket_name, table_dir, "")

    failure = node.query_and_get_error(f"SELECT * FROM {table_name}")
    assert "UNSUPPORTED_METHOD" in failure

def test_row_based_deletes(started_cluster):
    """Selecting from a merge-on-read table after a Spark DELETE reports UNSUPPORTED_METHOD."""
    node = started_cluster.instances["node1"]
    session = started_cluster.spark_session
    minio = started_cluster.minio_client
    bucket_name = started_cluster.minio_bucket
    table_name = "test_row_based_deletes"
    table_dir = f"/iceberg_data/default/{table_name}/"

    create_statement = f"CREATE TABLE {table_name} (id bigint, data string) USING iceberg TBLPROPERTIES ('format-version' = '2', 'write.update.mode'='merge-on-read', 'write.delete.mode'='merge-on-read', 'write.merge.mode'='merge-on-read')"
    insert_statement = f"INSERT INTO {table_name} select id, char(id + ascii('a')) from range(100)"
    session.sql(create_statement)
    session.sql(insert_statement)

    upload_directory(minio, bucket_name, table_dir, "")

    create_iceberg_table(node, table_name)
    row_count = int(node.query(f"SELECT count() FROM {table_name}"))
    assert row_count == 100

    # Delete a subset of rows in Spark, then upload the table directory again.
    session.sql(f"DELETE FROM {table_name} WHERE id < 10")
    upload_directory(minio, bucket_name, table_dir, "")

    failure = node.query_and_get_error(f"SELECT * FROM {table_name}")
    assert "UNSUPPORTED_METHOD" in failure

@pytest.mark.parametrize("format_version", ["1", "2"])
def test_schema_inference(started_cluster, format_version):
    """DESC on the Iceberg table yields the expected ClickHouse types for each data format."""
    node = started_cluster.instances["node1"]
    session = started_cluster.spark_session
    minio = started_cluster.minio_client
    bucket_name = started_cluster.minio_bucket

    # Column name / ClickHouse type pairs expected for every data format.
    expected_rows = [
        ["intC", "Nullable(Int32)"],
        ["longC", "Nullable(Int64)"],
        ["floatC", "Nullable(Float32)"],
        ["doubleC", "Nullable(Float64)"],
        ["decimalC1", "Nullable(Decimal(10, 3))"],
        ["decimalC2", "Nullable(Decimal(20, 10))"],
        ["decimalC3", "Nullable(Decimal(38, 30))"],
        ["dateC", "Nullable(Date)"],
        ["timestampC", "Nullable(DateTime64(6, \'UTC\'))"],
        ["stringC", "Nullable(String)"],
        ["binaryC", "Nullable(String)"],
        ["arrayC1", "Array(Nullable(Int32))"],
        ["mapC1", "Map(String, Nullable(String))"],
        ["structC1", "Tuple(field1 Nullable(Int32), field2 Nullable(String))"],
        ["complexC", "Array(Tuple(field1 Map(String, Array(Map(String, Nullable(Int32)))), field2 Tuple(field3 Nullable(Int32), field4 Nullable(String))))"],
    ]

    for file_format in ("Parquet", "ORC", "Avro"):
        table_name = "test_schema_inference_" + file_format + "_" + format_version

        # Types time, timestamptz, fixed are not supported in Spark.
        session.sql(
            f"CREATE TABLE {table_name} (intC int, longC long, floatC float, doubleC double, decimalC1 decimal(10, 3), decimalC2 decimal(20, 10), decimalC3 decimal(38, 30), dateC date,  timestampC timestamp, stringC string, binaryC binary, arrayC1 array<int>, mapC1 map<string, string>, structC1 struct<field1: int, field2: string>, complexC array<struct<field1: map<string, array<map<string, int>>>, field2: struct<field3: int, field4: string>>>) USING iceberg TBLPROPERTIES ('format-version' = '{format_version}', 'write.format.default' = '{file_format}')"
        )

        session.sql(
            f"insert into {table_name} select 42, 4242, 42.42, 4242.4242, decimal(42.42), decimal(42.42), decimal(42.42), date('2020-01-01'), timestamp('2020-01-01 20:00:00'), 'hello', binary('hello'), array(1,2,3), map('key', 'value'), struct(42, 'hello'), array(struct(map('key', array(map('key', 42))), struct(42, 'hello')))"
        )

        upload_directory(
            minio, bucket_name, f"/iceberg_data/default/{table_name}/", ""
        )

        create_iceberg_table(node, table_name, file_format)

        described = node.query(f"DESC {table_name} FORMAT TSVRaw")
        assert described == TSV(expected_rows)

        # Check that we can parse data
        node.query(f"SELECT * FROM {table_name}")


@pytest.mark.parametrize("format_version", ["1", "2"])
def test_metadata_file_selection(started_cluster, format_version):
    """After 50 separate inserts of 10 rows, ClickHouse sees all 500 rows."""
    node = started_cluster.instances["node1"]
    session = started_cluster.spark_session
    minio = started_cluster.minio_client
    bucket_name = started_cluster.minio_bucket
    table_name = "test_metadata_selection_" + format_version

    # NOTE(review): the DDL hard-codes 'format-version' = '2' even though the
    # test is parametrized by format_version (which only affects the table
    # name here) - confirm whether that is intended.
    session.sql(
        f"CREATE TABLE {table_name} (id bigint, data string) USING iceberg TBLPROPERTIES ('format-version' = '2', 'write.update.mode'='merge-on-read', 'write.delete.mode'='merge-on-read', 'write.merge.mode'='merge-on-read')"
    )

    insert_statement = f"INSERT INTO {table_name} select id, char(id + ascii('a')) from range(10)"
    for _ in range(50):
        session.sql(insert_statement)

    upload_directory(minio, bucket_name, f"/iceberg_data/default/{table_name}/", "")

    create_iceberg_table(node, table_name)

    total_rows = int(node.query(f"SELECT count() FROM {table_name}"))
    assert total_rows == 500
fix and add test 2023-01-18 08:33:55 +00:00			`import helpers.client`
			`from helpers.cluster import ClickHouseCluster`
			`from helpers.test_tools import TSV`

Add spark to tests, rewrite tests, fix bug 2023-03-07 15:04:21 +00:00			`import pyspark`
			`import logging`
			`import os`
			`import json`
			`import pytest`
			`import time`
Try to understand why some tests fail in CI, but locally pass 2023-04-05 14:28:27 +00:00			`import glob`
fix and add test 2023-01-18 08:33:55 +00:00
Better 2023-04-03 14:57:49 +00:00			`from pyspark.sql.types import (`
			`StructType,`
			`StructField,`
			`StringType,`
			`IntegerType,`
			`DateType,`
			`TimestampType,`
			`BooleanType,`
			`ArrayType,`
			`)`
			`from pyspark.sql.functions import current_timestamp`
			`from datetime import datetime`
			`from pyspark.sql.functions import monotonically_increasing_id, row_number`
			`from pyspark.sql.window import Window`
			`from pyspark.sql.readwriter import DataFrameWriter, DataFrameWriterV2`

Add spark to tests, rewrite tests, fix bug 2023-03-07 15:04:21 +00:00			`from helpers.s3_tools import prepare_s3_bucket, upload_directory, get_file_contents`
fix and add test 2023-01-18 08:33:55 +00:00
Add spark to tests, rewrite tests, fix bug 2023-03-07 15:04:21 +00:00			`SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))`
fix and add test 2023-01-18 08:33:55 +00:00

Close spark session 2023-04-11 15:23:05 +00:00			`def get_spark():`
			`builder = (`
			`pyspark.sql.SparkSession.builder.appName("spark_test")`
			`.config(`
			`"spark.sql.catalog.spark_catalog",`
			`"org.apache.iceberg.spark.SparkSessionCatalog",`
			`)`
			`.config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")`
			`.config("spark.sql.catalog.spark_catalog.type", "hadoop")`
			`.config("spark.sql.catalog.spark_catalog.warehouse", "/iceberg_data")`
Add tests 2023-10-17 18:10:47 +00:00			`.config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")`
Close spark session 2023-04-11 15:23:05 +00:00			`.master("local")`
			`)`
			`return builder.master("local").getOrCreate()`


fix and add test 2023-01-18 08:33:55 +00:00			`@pytest.fixture(scope="module")`
			`def started_cluster():`
			`try:`
Better 2023-04-13 13:10:49 +00:00			`cluster = ClickHouseCluster(__file__, with_spark=True)`
Better 2023-03-30 21:09:12 +00:00			`cluster.add_instance(`
			`"node1",`
			`main_configs=["configs/config.d/named_collections.xml"],`
Fix tests 2023-06-14 11:45:53 +00:00			`user_configs=["configs/users.d/users.xml"],`
Better 2023-03-30 21:09:12 +00:00			`with_minio=True,`
			`)`
fix and add test 2023-01-18 08:33:55 +00:00
			`logging.info("Starting cluster...")`
			`cluster.start()`

			`prepare_s3_bucket(cluster)`
			`logging.info("S3 bucket created")`

Close spark session 2023-04-11 15:23:05 +00:00			`cluster.spark_session = get_spark()`

fix and add test 2023-01-18 08:33:55 +00:00			`yield cluster`

			`finally:`
			`cluster.shutdown()`


			`def run_query(instance, query, stdin=None, settings=None):`
			`# type: (ClickHouseInstance, str, object, dict) -> str`

			`logging.info("Running query '{}'...".format(query))`
			`result = instance.query(query, stdin=stdin, settings=settings)`
			`logging.info("Query finished")`

			`return result`


Better 2023-04-03 14:57:49 +00:00			`def write_iceberg_from_file(`
Better 2023-04-03 18:56:10 +00:00			`spark, path, table_name, mode="overwrite", format_version="1", partition_by=None`
Better 2023-04-03 14:57:49 +00:00			`):`
			`if mode == "overwrite":`
Better 2023-04-03 18:56:10 +00:00			`if partition_by is None:`
			`spark.read.load(f"file://{path}").writeTo(table_name).tableProperty(`
			`"format-version", format_version`
			`).using("iceberg").create()`
			`else:`
			`spark.read.load(f"file://{path}").writeTo(table_name).partitionedBy(`
			`partition_by`
			`).tableProperty("format-version", format_version).using("iceberg").create()`
Better 2023-04-03 14:57:49 +00:00			`else:`
			`spark.read.load(f"file://{path}").writeTo(table_name).append()`


Better 2023-04-03 18:56:10 +00:00			`def write_iceberg_from_df(`
			`spark, df, table_name, mode="overwrite", format_version="1", partition_by=None`
			`):`
Better 2023-04-03 14:57:49 +00:00			`if mode == "overwrite":`
Better 2023-04-03 18:56:10 +00:00			`if partition_by is None:`
			`df.writeTo(table_name).tableProperty(`
			`"format-version", format_version`
			`).using("iceberg").create()`
			`else:`
			`df.writeTo(table_name).tableProperty(`
			`"format-version", format_version`
			`).partitionedBy(partition_by).using("iceberg").create()`
Better 2023-04-03 14:57:49 +00:00			`else:`
			`df.writeTo(table_name).append()`


			`def generate_data(spark, start, end):`
			`a = spark.range(start, end, 1).toDF("a")`
			`b = spark.range(start + 1, end + 1, 1).toDF("b")`
			`b = b.withColumn("b", b["b"].cast(StringType()))`
Better 2023-03-30 21:09:12 +00:00
Better 2023-04-03 14:57:49 +00:00			`a = a.withColumn(`
			`"row_index", row_number().over(Window.orderBy(monotonically_increasing_id()))`
			`)`
			`b = b.withColumn(`
			`"row_index", row_number().over(Window.orderBy(monotonically_increasing_id()))`
			`)`
Better 2023-03-30 21:09:12 +00:00
Better 2023-04-03 14:57:49 +00:00			`df = a.join(b, on=["row_index"]).drop("row_index")`
			`return df`
fix and add test 2023-01-18 08:33:55 +00:00
Add spark to tests, rewrite tests, fix bug 2023-03-07 15:04:21 +00:00
Add tests 2023-10-17 18:10:47 +00:00			`def create_iceberg_table(node, table_name, format="Parquet"):`
Better 2023-03-30 21:09:12 +00:00			`node.query(`
			`f"""`
			`DROP TABLE IF EXISTS {table_name};`
			`CREATE TABLE {table_name}`
Add tests 2023-10-17 18:10:47 +00:00			`ENGINE=Iceberg(s3, filename = 'iceberg_data/default/{table_name}/', format={format})"""`
Add spark to tests, rewrite tests, fix bug 2023-03-07 15:04:21 +00:00			`)`
fix and add test 2023-01-18 08:33:55 +00:00

Fix 2023-04-05 18:32:37 +00:00			`def create_initial_data_file(`
			`cluster, node, query, table_name, compression_method="none"`
			`):`
Better 2023-03-30 21:09:12 +00:00			`node.query(`
			`f"""`
			`INSERT INTO TABLE FUNCTION`
			`file('{table_name}.parquet')`
			`SETTINGS`
			`output_format_parquet_compression_method='{compression_method}',`
			`s3_truncate_on_insert=1 {query}`
			`FORMAT Parquet"""`
			`)`
Fix 2023-04-05 18:32:37 +00:00			`user_files_path = os.path.join(`
			`SCRIPT_DIR, f"{cluster.instances_dir_name}/node1/database/user_files"`
			`)`
			`result_path = f"{user_files_path}/{table_name}.parquet"`
Better 2023-03-30 21:09:12 +00:00			`return result_path`

Add spark to tests, rewrite tests, fix bug 2023-03-07 15:04:21 +00:00
Better 2023-04-03 14:57:49 +00:00			`@pytest.mark.parametrize("format_version", ["1", "2"])`
			`def test_single_iceberg_file(started_cluster, format_version):`
Better 2023-03-30 21:09:12 +00:00			`instance = started_cluster.instances["node1"]`
Close spark session 2023-04-11 15:23:05 +00:00			`spark = started_cluster.spark_session`
Add spark to tests, rewrite tests, fix bug 2023-03-07 15:04:21 +00:00			`minio_client = started_cluster.minio_client`
fix and add test 2023-01-18 08:33:55 +00:00			`bucket = started_cluster.minio_bucket`
Better 2023-04-03 14:57:49 +00:00			`TABLE_NAME = "test_single_iceberg_file_" + format_version`
fix and add test 2023-01-18 08:33:55 +00:00
Add tests 2023-10-17 18:10:47 +00:00			`inserted_data = "SELECT number, toString(number) as string FROM numbers(100)"`
Fix 2023-04-05 18:32:37 +00:00			`parquet_data_path = create_initial_data_file(`
			`started_cluster, instance, inserted_data, TABLE_NAME`
			`)`

Better 2023-04-03 14:57:49 +00:00			`write_iceberg_from_file(`
			`spark, parquet_data_path, TABLE_NAME, format_version=format_version`
			`)`
Better 2023-03-30 21:09:12 +00:00
			`files = upload_directory(`
			`minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""`
Add spark to tests, rewrite tests, fix bug 2023-03-07 15:04:21 +00:00			`)`
Better 2023-03-30 21:09:12 +00:00
			`create_iceberg_table(instance, TABLE_NAME)`
Add spark to tests, rewrite tests, fix bug 2023-03-07 15:04:21 +00:00			`assert instance.query(f"SELECT * FROM {TABLE_NAME}") == instance.query(`
			`inserted_data`
fix and add test 2023-01-18 08:33:55 +00:00			`)`
Better 2023-03-30 21:09:12 +00:00

Better 2023-04-03 18:56:10 +00:00			`@pytest.mark.parametrize("format_version", ["1", "2"])`
			`def test_partition_by(started_cluster, format_version):`
			`instance = started_cluster.instances["node1"]`
Close spark session 2023-04-11 15:23:05 +00:00			`spark = started_cluster.spark_session`
Better 2023-04-03 18:56:10 +00:00			`minio_client = started_cluster.minio_client`
			`bucket = started_cluster.minio_bucket`
			`TABLE_NAME = "test_partition_by_" + format_version`

			`write_iceberg_from_df(`
			`spark,`
			`generate_data(spark, 0, 10),`
			`TABLE_NAME,`
			`mode="overwrite",`
			`format_version=format_version,`
			`partition_by="a",`
			`)`

			`files = upload_directory(`
			`minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""`
			`)`
			`assert len(files) == 14 # 10 partitiions + 4 metadata files`

			`create_iceberg_table(instance, TABLE_NAME)`
			`assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 10`


Better 2023-04-03 14:57:49 +00:00			`@pytest.mark.parametrize("format_version", ["1", "2"])`
			`def test_multiple_iceberg_files(started_cluster, format_version):`
			`instance = started_cluster.instances["node1"]`
Close spark session 2023-04-11 15:23:05 +00:00			`spark = started_cluster.spark_session`
Better 2023-04-03 14:57:49 +00:00			`minio_client = started_cluster.minio_client`
			`bucket = started_cluster.minio_bucket`
			`TABLE_NAME = "test_multiple_iceberg_files_" + format_version`

			`write_iceberg_from_df(`
Better 2023-04-03 18:56:10 +00:00			`spark,`
			`generate_data(spark, 0, 100),`
			`TABLE_NAME,`
			`mode="overwrite",`
			`format_version=format_version,`
Better 2023-04-03 14:57:49 +00:00			`)`

			`files = upload_directory(`
			`minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}", ""`
			`)`
			`# ['/iceberg_data/default/test_multiple_iceberg_files/data/00000-1-35302d56-f1ed-494e-a85b-fbf85c05ab39-00001.parquet',`
			`# '/iceberg_data/default/test_multiple_iceberg_files/metadata/version-hint.text',`
			`# '/iceberg_data/default/test_multiple_iceberg_files/metadata/3127466b-299d-48ca-a367-6b9b1df1e78c-m0.avro',`
			`# '/iceberg_data/default/test_multiple_iceberg_files/metadata/snap-5220855582621066285-1-3127466b-299d-48ca-a367-6b9b1df1e78c.avro',`
			`# '/iceberg_data/default/test_multiple_iceberg_files/metadata/v1.metadata.json']`
			`assert len(files) == 5`

			`create_iceberg_table(instance, TABLE_NAME)`
			`assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100`

			`write_iceberg_from_df(`
Better 2023-04-03 18:56:10 +00:00			`spark,`
			`generate_data(spark, 100, 200),`
			`TABLE_NAME,`
			`mode="append",`
			`format_version=format_version,`
Better 2023-04-03 14:57:49 +00:00			`)`
			`files = upload_directory(`
			`minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}", ""`
			`)`
			`assert len(files) == 9`

			`assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 200`
			`assert instance.query(f"SELECT * FROM {TABLE_NAME} ORDER BY 1") == instance.query(`
			`"SELECT number, toString(number + 1) FROM numbers(200)"`
			`)`


			`@pytest.mark.parametrize("format_version", ["1", "2"])`
			`def test_types(started_cluster, format_version):`
Better 2023-03-30 21:09:12 +00:00			`instance = started_cluster.instances["node1"]`
Close spark session 2023-04-11 15:23:05 +00:00			`spark = started_cluster.spark_session`
Better 2023-03-30 21:09:12 +00:00			`minio_client = started_cluster.minio_client`
			`bucket = started_cluster.minio_bucket`
Better 2023-04-03 14:57:49 +00:00			`TABLE_NAME = "test_types_" + format_version`

			`data = [`
			`(`
			`123,`
			`"string",`
			`datetime.strptime("2000-01-01", "%Y-%m-%d"),`
			`["str1", "str2"],`
			`True,`
			`)`
			`]`
			`schema = StructType(`
			`[`
			`StructField("a", IntegerType()),`
			`StructField("b", StringType()),`
			`StructField("c", DateType()),`
			`StructField("d", ArrayType(StringType())),`
			`StructField("e", BooleanType()),`
			`]`
			`)`
			`df = spark.createDataFrame(data=data, schema=schema)`
			`df.printSchema()`
Better 2023-04-03 18:56:10 +00:00			`write_iceberg_from_df(`
			`spark, df, TABLE_NAME, mode="overwrite", format_version=format_version`
			`)`
Better 2023-04-03 14:57:49 +00:00
			`upload_directory(minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}", "")`

			`create_iceberg_table(instance, TABLE_NAME)`
			`assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 1`
			`assert (`
			`instance.query(f"SELECT a, b, c, d, e FROM {TABLE_NAME}").strip()`
			`== "123\tstring\t2000-01-01\t['str1','str2']\ttrue"`
			`)`

			`table_function = f"iceberg(s3, filename='iceberg_data/default/{TABLE_NAME}/')"`
			`assert (`
			`instance.query(f"SELECT a, b, c, d, e FROM {table_function}").strip()`
			`== "123\tstring\t2000-01-01\t['str1','str2']\ttrue"`
			`)`

			`assert instance.query(f"DESCRIBE {table_function} FORMAT TSV") == TSV(`
			`[`
			`["a", "Nullable(Int32)"],`
			`["b", "Nullable(String)"],`
Add tests 2023-10-17 18:10:47 +00:00			`["c", "Nullable(Date)"],`
Better 2023-04-03 14:57:49 +00:00			`["d", "Array(Nullable(String))"],`
			`["e", "Nullable(Bool)"],`
			`]`
			`)`
Add a test 2023-10-15 16:02:34 +00:00

			@pytest.mark.parametrize("format_version", ["1", "2"])
			def test_delete_files(started_cluster, format_version):
			    """Check the row count seen by ClickHouse after Spark-side deletes.

			    Sequence: write rows 0..100, delete all of them, write rows 100..200,
			    then delete the rows with ``a >= 150``. After every step the table
			    directory is uploaded again and ``SELECT count()`` is compared with the
			    expected number of rows (100, 0, 100, 50).
			    """
			    node = started_cluster.instances["node1"]
			    spark = started_cluster.spark_session
			    minio_client = started_cluster.minio_client
			    bucket = started_cluster.minio_bucket
			    table = "test_delete_files_" + format_version
			    table_dir = f"/iceberg_data/default/{table}/"

			    def sync_to_bucket():
			        # Upload the local table directory again after each Spark change.
			        upload_directory(minio_client, bucket, table_dir, "")

			    def row_count():
			        return int(node.query(f"SELECT count() FROM {table}"))

			    # Step 1: initial data set.
			    write_iceberg_from_df(
			        spark,
			        generate_data(spark, 0, 100),
			        table,
			        mode="overwrite",
			        format_version=format_version,
			    )
			    sync_to_bucket()
			    create_iceberg_table(node, table)
			    assert row_count() == 100

			    # Step 2: this delete is expected to leave the table empty, and the
			    # server log must mention that a delete file was processed.
			    spark.sql(f"DELETE FROM {table} WHERE a >= 0")
			    sync_to_bucket()
			    assert row_count() == 0
			    assert node.contains_in_log("Processing delete file for path")

			    # Step 3: add a second batch; the test expects exactly these 100 rows.
			    # NOTE(review): how write_iceberg_from_df treats mode="upsert" is
			    # defined outside this block - confirm against the helper.
			    write_iceberg_from_df(
			        spark,
			        generate_data(spark, 100, 200),
			        table,
			        mode="upsert",
			        format_version=format_version,
			    )
			    sync_to_bucket()
			    assert row_count() == 100

			    # Step 4: partial delete, half of the second batch is expected to remain.
			    spark.sql(f"DELETE FROM {table} WHERE a >= 150")
			    sync_to_bucket()
			    assert row_count() == 50
Add tests 2023-10-17 18:10:47 +00:00

			@pytest.mark.parametrize("format_version", ["1", "2"])
			def test_evolved_schema(started_cluster, format_version):
			    """Expect UNSUPPORTED_METHOD when reading a table whose schema was altered.

			    The table is first written with 100 rows and read back successfully.
			    Spark then adds a column ``x`` and the test asserts that the next
			    ``SELECT *`` fails with an error mentioning ``UNSUPPORTED_METHOD``.
			    """
			    node = started_cluster.instances["node1"]
			    spark = started_cluster.spark_session
			    minio_client = started_cluster.minio_client
			    bucket = started_cluster.minio_bucket
			    table = "test_evolved_schema_" + format_version
			    table_dir = f"/iceberg_data/default/{table}/"

			    initial_rows = generate_data(spark, 0, 100)
			    write_iceberg_from_df(
			        spark,
			        initial_rows,
			        table,
			        mode="overwrite",
			        format_version=format_version,
			    )
			    upload_directory(minio_client, bucket, table_dir, "")

			    create_iceberg_table(node, table)
			    rows_before_alter = int(node.query(f"SELECT count() FROM {table}"))
			    assert rows_before_alter == 100

			    # Change the schema on the Spark side and upload the directory again.
			    spark.sql(f"ALTER TABLE {table} ADD COLUMNS (x bigint)")
			    upload_directory(minio_client, bucket, table_dir, "")

			    failure = node.query_and_get_error(f"SELECT * FROM {table}")
			    assert "UNSUPPORTED_METHOD" in failure

			def test_row_based_deletes(started_cluster):
			    """Expect UNSUPPORTED_METHOD after a delete on a merge-on-read v2 table.

			    The table is created through Spark SQL with format version 2 and the
			    update/delete/merge write modes set to ``merge-on-read``. After 100 rows
			    are inserted and read back, rows with ``id < 10`` are deleted and the
			    test asserts that the next ``SELECT *`` fails with ``UNSUPPORTED_METHOD``.
			    """
			    node = started_cluster.instances["node1"]
			    spark = started_cluster.spark_session
			    minio_client = started_cluster.minio_client
			    bucket = started_cluster.minio_bucket
			    table = "test_row_based_deletes"
			    table_dir = f"/iceberg_data/default/{table}/"

			    create_stmt = (
			        f"CREATE TABLE {table} (id bigint, data string) USING iceberg "
			        "TBLPROPERTIES ('format-version' = '2', "
			        "'write.update.mode'='merge-on-read', "
			        "'write.delete.mode'='merge-on-read', "
			        "'write.merge.mode'='merge-on-read')"
			    )
			    spark.sql(create_stmt)
			    spark.sql(
			        f"INSERT INTO {table} select id, char(id + ascii('a')) from range(100)"
			    )
			    upload_directory(minio_client, bucket, table_dir, "")

			    create_iceberg_table(node, table)
			    rows_before_delete = int(node.query(f"SELECT count() FROM {table}"))
			    assert rows_before_delete == 100

			    # Delete a subset of rows on the Spark side and upload the result.
			    spark.sql(f"DELETE FROM {table} WHERE id < 10")
			    upload_directory(minio_client, bucket, table_dir, "")

			    failure = node.query_and_get_error(f"SELECT * FROM {table}")
			    assert "UNSUPPORTED_METHOD" in failure

			@pytest.mark.parametrize("format_version", ["1", "2"])
			def test_schema_inference(started_cluster, format_version):
			    """Check the ClickHouse column types inferred for an Iceberg table.

			    For each data file format (Parquet, ORC, Avro) a table covering the
			    listed Spark column types is created and filled with one row. The test
			    then compares ``DESC <table>`` with the expected ClickHouse types and
			    runs ``SELECT *`` to make sure the data can be read.
			    """
			    instance = started_cluster.instances["node1"]
			    spark = started_cluster.spark_session
			    minio_client = started_cluster.minio_client
			    bucket = started_cluster.minio_bucket

			    # The expected DESC output does not depend on the file format, so the
			    # column list is built once instead of on every loop iteration.
			    expected_columns = [
			        ["intC", "Nullable(Int32)"],
			        ["longC", "Nullable(Int64)"],
			        ["floatC", "Nullable(Float32)"],
			        ["doubleC", "Nullable(Float64)"],
			        ["decimalC1", "Nullable(Decimal(10, 3))"],
			        ["decimalC2", "Nullable(Decimal(20, 10))"],
			        ["decimalC3", "Nullable(Decimal(38, 30))"],
			        ["dateC", "Nullable(Date)"],
			        ["timestampC", "Nullable(DateTime64(6, 'UTC'))"],
			        ["stringC", "Nullable(String)"],
			        ["binaryC", "Nullable(String)"],
			        ["arrayC1", "Array(Nullable(Int32))"],
			        ["mapC1", "Map(String, Nullable(String))"],
			        ["structC1", "Tuple(field1 Nullable(Int32), field2 Nullable(String))"],
			        ["complexC", "Array(Tuple(field1 Map(String, Array(Map(String, Nullable(Int32)))), field2 Tuple(field3 Nullable(Int32), field4 Nullable(String))))"],
			    ]

			    # `file_format` rather than `format`, so the builtin is not shadowed.
			    for file_format in ["Parquet", "ORC", "Avro"]:
			        table_name = "test_schema_inference_" + file_format + "_" + format_version

			        # Types time, timestamptz, fixed are not supported in Spark.
			        spark.sql(
			            f"CREATE TABLE {table_name} (intC int, longC long, floatC float, doubleC double, decimalC1 decimal(10, 3), decimalC2 decimal(20, 10), decimalC3 decimal(38, 30), dateC date, timestampC timestamp, stringC string, binaryC binary, arrayC1 array<int>, mapC1 map<string, string>, structC1 struct<field1: int, field2: string>, complexC array<struct<field1: map<string, array<map<string, int>>>, field2: struct<field3: int, field4: string>>>) USING iceberg TBLPROPERTIES ('format-version' = '{format_version}', 'write.format.default' = '{file_format}')"
			        )

			        spark.sql(
			            f"insert into {table_name} select 42, 4242, 42.42, 4242.4242, decimal(42.42), decimal(42.42), decimal(42.42), date('2020-01-01'), timestamp('2020-01-01 20:00:00'), 'hello', binary('hello'), array(1,2,3), map('key', 'value'), struct(42, 'hello'), array(struct(map('key', array(map('key', 42))), struct(42, 'hello')))"
			        )

			        upload_directory(
			            minio_client, bucket, f"/iceberg_data/default/{table_name}/", ""
			        )

			        create_iceberg_table(instance, table_name, file_format)

			        res = instance.query(f"DESC {table_name} FORMAT TSVRaw")
			        assert res == TSV(expected_columns)

			        # Check that we can parse data
			        instance.query(f"SELECT * FROM {table_name}")


			@pytest.mark.parametrize("format_version", ["1", "2"])
			def test_metadata_file_selection(started_cluster, format_version):
			    """Check that a table built from many separate inserts is read in full.

			    Fifty INSERT statements of ten rows each are executed through Spark
			    before the table directory is uploaded; ClickHouse must then report
			    all 500 rows.
			    NOTE(review): presumably every insert commits a new metadata file, so
			    the reader has to pick the latest of many - confirm against the
			    Iceberg commit behaviour.
			    """
			    instance = started_cluster.instances["node1"]
			    spark = started_cluster.spark_session
			    minio_client = started_cluster.minio_client
			    bucket = started_cluster.minio_bucket
			    TABLE_NAME = "test_metadata_selection_" + format_version

			    # The format version comes from the test parameter. It used to be
			    # hard-coded to '2', so the "1" parametrization silently re-tested v2.
			    # The merge-on-read write modes are kept as they were; this test only
			    # inserts rows and issues no UPDATE/DELETE/MERGE.
			    spark.sql(
			        f"CREATE TABLE {TABLE_NAME} (id bigint, data string) USING iceberg TBLPROPERTIES ('format-version' = '{format_version}', 'write.update.mode'='merge-on-read', 'write.delete.mode'='merge-on-read', 'write.merge.mode'='merge-on-read')"
			    )

			    for _ in range(50):
			        spark.sql(
			            f"INSERT INTO {TABLE_NAME} select id, char(id + ascii('a')) from range(10)"
			        )

			    upload_directory(
			        minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
			    )

			    create_iceberg_table(instance, TABLE_NAME)

			    assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 500