ClickHouse/tests/integration/test_storage_hudi/test.py

import logging
import pytest
import os
import json

import helpers.client
from helpers.cluster import ClickHouseCluster, ClickHouseInstance
from helpers.test_tools import TSV
from helpers.s3_tools import prepare_s3_bucket, upload_directory, get_file_contents

import pyspark
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DateType,
    TimestampType,
    BooleanType,
    ArrayType,
)
from pyspark.sql.functions import current_timestamp
from datetime import datetime
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window


# Directory that contains this test file, with symlinks resolved.  It is the
# base for locating per-instance data directories on the host (see
# create_initial_data_file).
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))


def get_spark():
    """Build (or reuse) the SparkSession used to produce Hudi test data.

    The session is named ``spark_test``, uses the Kryo serializer, registers
    ``HoodieCatalog`` under the ``local`` catalog name, gives the driver 20g
    of memory and runs against the ``local`` master.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.
    """
    builder = (
        pyspark.sql.SparkSession.builder.appName("spark_test")
        # A former `.config("org.apache.spark.sql.hudi.catalog.HoodieCatalog")`
        # call was removed here: it passed a class name as the *key* with no
        # value, so it did not configure anything meaningful.
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config(
            "spark.sql.catalog.local", "org.apache.spark.sql.hudi.catalog.HoodieCatalog"
        )
        .config("spark.driver.memory", "20g")
        .master("local")
    )
    return builder.getOrCreate()


@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture: a started cluster with MinIO and a Spark session.

    Adds the ``node1`` instance, starts the cluster, prepares the S3 bucket and
    attaches a Spark session as ``cluster.spark_session`` before yielding the
    cluster to the tests.

    On teardown the Spark session (if it was created) is stopped and the
    cluster is shut down.  The cluster shutdown runs even if stopping Spark
    raises.
    """
    cluster = ClickHouseCluster(__file__, with_spark=True)
    spark_session = None
    try:
        cluster.add_instance(
            "node1",
            main_configs=["configs/config.d/named_collections.xml"],
            user_configs=["configs/users.d/users.xml"],
            with_minio=True,
        )

        logging.info("Starting cluster...")
        cluster.start()

        prepare_s3_bucket(cluster)
        logging.info("S3 bucket created")

        spark_session = get_spark()
        cluster.spark_session = spark_session

        yield cluster

    finally:
        try:
            # Release the Spark session so it does not outlive this module.
            if spark_session is not None:
                spark_session.stop()
        finally:
            cluster.shutdown()


def run_query(instance, query, stdin=None, settings=None):
    # type: (ClickHouseInstance, str, object, dict) -> str
    """Execute ``query`` on ``instance`` and return its result.

    An info-level log line is written before the query is sent and another
    one after it returns.  ``stdin`` and ``settings`` are forwarded unchanged
    to ``instance.query``.
    """
    start_message = "Running query '{}'...".format(query)
    logging.info(start_message)

    output = instance.query(query, stdin=stdin, settings=settings)

    logging.info("Query finished")
    return output


def write_hudi_from_df(spark, table_name, df, result_path, mode="overwrite"):
    """Write ``df`` to ``result_path`` in Hudi format.

    Args:
        spark: Not used by this function; kept so existing callers keep working.
        table_name: Value for the Hudi table-name options.
        df: DataFrame to write.  Column ``a`` is configured as both the record
            key and the precombine field.
        result_path: Destination passed to the writer's ``save``.
        mode: Spark save mode.  ``"overwrite"`` selects the Hudi
            ``insert_overwrite`` operation; any other value selects ``upsert``.
    """
    hudi_write_mode = "insert_overwrite" if mode == "overwrite" else "upsert"

    # Applied in this order after the format is selected.
    hudi_options = {
        "hoodie.table.name": table_name,
        "hoodie.datasource.write.partitionpath.field": "partitionpath",
        "hoodie.datasource.write.table.name": table_name,
        "hoodie.datasource.write.operation": hudi_write_mode,
        "hoodie.datasource.compaction.async.enable": "true",
        "hoodie.compact.inline": "false",
        "hoodie.compact.inline.max.delta.commits": "10",
        "hoodie.parquet.compression.codec": "snappy",
        "hoodie.hfile.compression.algorithm": "uncompressed",
        "hoodie.datasource.write.recordkey.field": "a",
        "hoodie.datasource.write.precombine.field": "a",
    }

    # The "compression" option used to be set twice with the same value;
    # setting it once is equivalent.
    writer = df.write.mode(mode).option("compression", "none").format("hudi")
    for key, value in hudi_options.items():
        writer = writer.option(key, value)
    writer.save(result_path)


def write_hudi_from_file(spark, table_name, path, result_path):
    """Load the local file at ``path`` with Spark and write it out as Hudi.

    The file is read through a ``file://`` URI and the resulting DataFrame is
    handed to ``write_hudi_from_df`` with its default write mode.
    """
    spark.conf.set("spark.sql.debug.maxToStringFields", 100000)
    source_uri = f"file://{path}"
    loaded = spark.read.load(source_uri)
    write_hudi_from_df(spark, table_name, loaded, result_path)


def generate_data(spark, start, end, append=1):
    """Build a two-column DataFrame ``(a, b)`` from two Spark ranges.

    ``a`` comes from ``range(start, end)``.  ``b`` comes from
    ``range(start + append, end + append)`` cast to string.  The two
    single-column frames are joined on a generated row index, which is then
    dropped from the result.
    """

    def with_row_index(frame):
        # Number the rows in the order given by monotonically_increasing_id().
        ordering = Window.orderBy(monotonically_increasing_id())
        return frame.withColumn("row_index", row_number().over(ordering))

    left = spark.range(start, end, 1).toDF("a")
    right = spark.range(start + append, end + append, 1).toDF("b")
    right = right.withColumn("b", right["b"].cast(StringType()))

    left = with_row_index(left)
    right = with_row_index(right)

    return left.join(right, on=["row_index"]).drop("row_index")


def create_hudi_table(node, table_name):
    """(Re)create table ``table_name`` on ``node`` with the Hudi engine.

    Any existing table of that name is dropped first.  The new table reads
    from the ``s3`` named collection with ``filename`` set to
    ``'<table_name>/'``.
    """
    ddl = f"""
        DROP TABLE IF EXISTS {table_name};
        CREATE TABLE {table_name}
        ENGINE=Hudi(s3, filename = '{table_name}/')"""
    node.query(ddl)


def create_initial_data_file(
    cluster, node, query, table_name, compression_method="none"
):
    """Materialise ``query`` as ``<table_name>.parquet`` and return its host path.

    The rows produced by ``query`` are inserted through the ``file`` table
    function on ``node`` in Parquet format, using ``compression_method`` for
    the Parquet output.  The returned path points into the ``node1``
    ``user_files`` directory under ``SCRIPT_DIR``.
    """
    insert_query = f"""
        INSERT INTO TABLE FUNCTION
            file('{table_name}.parquet')
        SETTINGS
            output_format_parquet_compression_method='{compression_method}',
            s3_truncate_on_insert=1 {query}
        FORMAT Parquet"""
    node.query(insert_query)

    user_files_dir = os.path.join(
        SCRIPT_DIR, f"{cluster.instances_dir_name}/node1/database/user_files"
    )
    return f"{user_files_dir}/{table_name}.parquet"


def test_single_hudi_file(started_cluster):
    """A Hudi table written from one Parquet file yields one data file and the same rows."""
    node = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    minio_client = started_cluster.minio_client
    bucket = started_cluster.minio_bucket
    table_name = "test_single_hudi_file"
    local_path = f"/{table_name}"

    source_query = "SELECT number as a, toString(number) as b FROM numbers(100)"
    parquet_path = create_initial_data_file(
        started_cluster, node, source_query, table_name
    )

    # Convert the Parquet file to a Hudi table locally, then push it to MinIO.
    write_hudi_from_file(spark, table_name, parquet_path, local_path)
    uploaded = upload_directory(minio_client, bucket, local_path, "")
    assert len(uploaded) == 1
    assert uploaded[0].endswith(".parquet")

    create_hudi_table(node, table_name)
    actual = node.query(f"SELECT a, b FROM {table_name}")
    expected = node.query(source_query)
    assert actual == expected


def test_multiple_hudi_files(started_cluster):
    """Grow a Hudi table through successive writes and check what ClickHouse reads.

    Steps: initial overwrite of 100 rows, two appends that extend the key
    range, one append that rewrites the row with ``a = 100``, and a final
    large append up to one million keys.
    """
    node = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    minio_client = started_cluster.minio_client
    bucket = started_cluster.minio_bucket
    table_name = "test_multiple_hudi_files"
    local_path = f"/{table_name}"

    def publish(df, **write_kwargs):
        # Write locally with Spark, then upload the table directory to MinIO.
        write_hudi_from_df(spark, table_name, df, local_path, **write_kwargs)
        return upload_directory(minio_client, bucket, local_path, "")

    def row_count():
        return int(node.query(f"SELECT count() FROM {table_name}"))

    def ordered_rows():
        return node.query(f"SELECT a, b FROM {table_name} ORDER BY 1")

    def b_for_key_100():
        return int(node.query(f"SELECT b FROM {table_name} WHERE a = 100"))

    # Initial write uses the default ("overwrite") mode.
    uploaded = publish(generate_data(spark, 0, 100))
    assert len(uploaded) == 1

    create_hudi_table(node, table_name)
    assert row_count() == 100

    uploaded = publish(generate_data(spark, 100, 200), mode="append")
    assert len(uploaded) == 2

    assert row_count() == 200
    assert ordered_rows() == node.query(
        "SELECT number, toString(number + 1) FROM numbers(200)"
    )

    uploaded = publish(generate_data(spark, 100, 300), mode="append")
    assert len(uploaded) == 3

    assert row_count() == 300
    assert ordered_rows() == node.query(
        "SELECT number, toString(number + 1) FROM numbers(300)"
    )

    # Rewrite the single row with a = 100 so that b changes from 101 to 100.
    assert b_for_key_100() == 101
    publish(generate_data(spark, 100, 101, append=0), mode="append")

    assert row_count() == 300
    assert b_for_key_100() == 100

    publish(generate_data(spark, 100, 1000000, append=0), mode="append")
    assert row_count() == 1000000


def test_types(started_cluster):
    """Write one row with int, string, date, array and bool columns and check how it is read back.

    The row is verified through both the Hudi table engine and the ``hudi``
    table function, and the table function's DESCRIBE output is compared with
    the expected column types.
    """
    node = started_cluster.instances["node1"]
    spark = started_cluster.spark_session
    minio_client = started_cluster.minio_client
    bucket = started_cluster.minio_bucket
    table_name = "test_types"
    local_path = f"/{table_name}"
    expected_row = "123\tstring\t2000-01-01\t['str1','str2']\ttrue"

    row = (
        123,
        "string",
        datetime.strptime("2000-01-01", "%Y-%m-%d"),
        ["str1", "str2"],
        True,
    )
    fields = [
        StructField("a", IntegerType()),
        StructField("b", StringType()),
        StructField("c", DateType()),
        StructField("d", ArrayType(StringType())),
        StructField("e", BooleanType()),
    ]
    df = spark.createDataFrame(data=[row], schema=StructType(fields))
    df.printSchema()
    write_hudi_from_df(spark, table_name, df, local_path, mode="overwrite")

    upload_directory(minio_client, bucket, local_path, "")

    create_hudi_table(node, table_name)
    assert int(node.query(f"SELECT count() FROM {table_name}")) == 1
    from_table = node.query(f"SELECT a, b, c, d, e FROM {table_name}").strip()
    assert from_table == expected_row

    table_function = f"hudi(s3, filename='{table_name}/')"
    from_function = node.query(f"SELECT a, b, c, d, e FROM {table_function}").strip()
    assert from_function == expected_row

    expected_columns = [
        ["_hoodie_commit_time", "Nullable(String)"],
        ["_hoodie_commit_seqno", "Nullable(String)"],
        ["_hoodie_record_key", "Nullable(String)"],
        ["_hoodie_partition_path", "Nullable(String)"],
        ["_hoodie_file_name", "Nullable(String)"],
        ["a", "Nullable(Int32)"],
        ["b", "Nullable(String)"],
        ["c", "Nullable(Date32)"],
        ["d", "Array(Nullable(String))"],
        ["e", "Nullable(Bool)"],
    ]
    described = node.query(f"DESCRIBE {table_function} FORMAT TSV")
    assert described == TSV(expected_columns)
Add test template 2022-08-30 09:14:05 +00:00			`import logging`
Hudi tests 2023-03-21 11:51:14 +00:00			`import pytest`
Add test template 2022-08-30 09:14:05 +00:00			`import os`
Add createquery test 2022-08-30 17:38:57 +00:00			`import json`
Add test template 2022-08-30 09:14:05 +00:00
			`import helpers.client`
Fix all problems in tests that had been found by flake8 Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> 2023-12-29 14:02:11 +00:00			`from helpers.cluster import ClickHouseCluster, ClickHouseInstance`
Add select test 2022-08-31 09:26:53 +00:00			`from helpers.test_tools import TSV`
Hudi tests 2023-03-21 11:51:14 +00:00			`from helpers.s3_tools import prepare_s3_bucket, upload_directory, get_file_contents`
Add test template 2022-08-30 09:14:05 +00:00
Hudi tests 2023-03-21 11:51:14 +00:00			`import pyspark`
Better 2023-03-29 18:01:21 +00:00			`from pyspark.sql.types import (`
			`StructType,`
			`StructField,`
			`StringType,`
			`IntegerType,`
			`DateType,`
			`TimestampType,`
			`BooleanType,`
			`ArrayType,`
			`)`
			`from pyspark.sql.functions import current_timestamp`
			`from datetime import datetime`
			`from pyspark.sql.functions import monotonically_increasing_id, row_number`
			`from pyspark.sql.window import Window`

Apply black formatter to tests 2022-09-06 18:05:33 +00:00
Merge remote-tracking branch 'upstream/master' into better-tests-for-data-lakes 2023-04-13 13:13:56 +00:00			`SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))`
Fix style check, black check 2023-03-21 21:17:59 +00:00
Add test template 2022-08-30 09:14:05 +00:00
Close spark session 2023-04-11 15:23:05 +00:00			`def get_spark():`
			`builder = (`
			`pyspark.sql.SparkSession.builder.appName("spark_test")`
			`.config(`
			`"org.apache.spark.sql.hudi.catalog.HoodieCatalog",`
			`)`
			`.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")`
			`.config(`
			`"spark.sql.catalog.local", "org.apache.spark.sql.hudi.catalog.HoodieCatalog"`
			`)`
			`.config("spark.driver.memory", "20g")`
			`.master("local")`
			`)`
Better 2023-04-12 12:38:39 +00:00			`return builder.getOrCreate()`
Close spark session 2023-04-11 15:23:05 +00:00

Add test template 2022-08-30 09:14:05 +00:00			`@pytest.fixture(scope="module")`
			`def started_cluster():`
Merge remote-tracking branch 'upstream/master' into better-tests-for-data-lakes 2023-04-13 13:13:56 +00:00			`cluster = ClickHouseCluster(__file__, with_spark=True)`
Add test template 2022-08-30 09:14:05 +00:00			`try:`
Better 2023-03-29 18:01:21 +00:00			`cluster.add_instance(`
			`"node1",`
			`main_configs=["configs/config.d/named_collections.xml"],`
Fix tests 2023-06-14 11:45:53 +00:00			`user_configs=["configs/users.d/users.xml"],`
Better 2023-03-29 18:01:21 +00:00			`with_minio=True,`
			`)`
Add test template 2022-08-30 09:14:05 +00:00
			`logging.info("Starting cluster...")`
			`cluster.start()`

			`prepare_s3_bucket(cluster)`
			`logging.info("S3 bucket created")`

Close spark session 2023-04-11 15:23:05 +00:00			`cluster.spark_session = get_spark()`

Add test template 2022-08-30 09:14:05 +00:00			`yield cluster`
Apply black formatter to tests 2022-09-06 18:05:33 +00:00
Add test template 2022-08-30 09:14:05 +00:00			`finally:`
			`cluster.shutdown()`

Apply black formatter to tests 2022-09-06 18:05:33 +00:00
Add test template 2022-08-30 09:14:05 +00:00			`def run_query(instance, query, stdin=None, settings=None):`
			`# type: (ClickHouseInstance, str, object, dict) -> str`

			`logging.info("Running query '{}'...".format(query))`
			`result = instance.query(query, stdin=stdin, settings=settings)`
			`logging.info("Query finished")`

			`return result`


Better 2023-03-29 18:01:21 +00:00			`def write_hudi_from_df(spark, table_name, df, result_path, mode="overwrite"):`
Update integration tests runner too 2023-06-21 14:33:43 +00:00			`if mode == "overwrite":`
Better 2023-03-29 18:01:21 +00:00			`hudi_write_mode = "insert_overwrite"`
			`else:`
			`hudi_write_mode = "upsert"`

			`df.write.mode(mode).option("compression", "none").option(`
Hudi tests 2023-03-21 11:51:14 +00:00			`"compression", "none"`
Better 2023-03-29 18:01:21 +00:00			`).format("hudi").option("hoodie.table.name", table_name).option(`
Hudi tests 2023-03-21 11:51:14 +00:00			`"hoodie.datasource.write.partitionpath.field", "partitionpath"`
			`).option(`
Better 2023-03-29 18:01:21 +00:00			`"hoodie.datasource.write.table.name", table_name`
Hudi tests 2023-03-21 11:51:14 +00:00			`).option(`
Better 2023-03-29 18:01:21 +00:00			`"hoodie.datasource.write.operation", hudi_write_mode`
Fixes for hudi 2023-03-30 16:25:54 +00:00			`).option(`
			`"hoodie.datasource.compaction.async.enable", "true"`
			`).option(`
			`"hoodie.compact.inline", "false"`
			`).option(`
			`"hoodie.compact.inline.max.delta.commits", "10"`
Hudi tests 2023-03-21 11:51:14 +00:00			`).option(`
			`"hoodie.parquet.compression.codec", "snappy"`
			`).option(`
			`"hoodie.hfile.compression.algorithm", "uncompressed"`
			`).option(`
			`"hoodie.datasource.write.recordkey.field", "a"`
			`).option(`
			`"hoodie.datasource.write.precombine.field", "a"`
			`).save(`
			`result_path`
Apply black formatter to tests 2022-09-06 18:05:33 +00:00			`)`
fix and add test 2022-11-09 10:04:53 +00:00

Better 2023-03-29 18:01:21 +00:00			`def write_hudi_from_file(spark, table_name, path, result_path):`
			`spark.conf.set("spark.sql.debug.maxToStringFields", 100000)`
			`df = spark.read.load(f"file://{path}")`
			`write_hudi_from_df(spark, table_name, df, result_path)`


Fixes for hudi 2023-03-30 16:25:54 +00:00			`def generate_data(spark, start, end, append=1):`
Better 2023-03-29 18:01:21 +00:00			`a = spark.range(start, end, 1).toDF("a")`
Fixes for hudi 2023-03-30 16:25:54 +00:00			`b = spark.range(start + append, end + append, 1).toDF("b")`
Better 2023-03-29 18:01:21 +00:00			`b = b.withColumn("b", b["b"].cast(StringType()))`

			`a = a.withColumn(`
			`"row_index", row_number().over(Window.orderBy(monotonically_increasing_id()))`
			`)`
			`b = b.withColumn(`
			`"row_index", row_number().over(Window.orderBy(monotonically_increasing_id()))`
			`)`

			`df = a.join(b, on=["row_index"]).drop("row_index")`
			`return df`


			`def create_hudi_table(node, table_name):`
			`node.query(`
			`f"""`
			`DROP TABLE IF EXISTS {table_name};`
			`CREATE TABLE {table_name}`
			`ENGINE=Hudi(s3, filename = '{table_name}/')"""`
			`)`


Fix 2023-04-05 18:32:37 +00:00			`def create_initial_data_file(`
			`cluster, node, query, table_name, compression_method="none"`
			`):`
Better 2023-03-29 18:01:21 +00:00			`node.query(`
			`f"""`
			`INSERT INTO TABLE FUNCTION`
			`file('{table_name}.parquet')`
			`SETTINGS`
			`output_format_parquet_compression_method='{compression_method}',`
			`s3_truncate_on_insert=1 {query}`
			`FORMAT Parquet"""`
			`)`
Fix 2023-04-05 18:32:37 +00:00			`user_files_path = os.path.join(`
			`SCRIPT_DIR, f"{cluster.instances_dir_name}/node1/database/user_files"`
			`)`
			`result_path = f"{user_files_path}/{table_name}.parquet"`
Better 2023-03-29 18:01:21 +00:00			`return result_path`


			`def test_single_hudi_file(started_cluster):`
Hudi tests 2023-03-21 11:51:14 +00:00			`instance = started_cluster.instances["node1"]`
Close spark session 2023-04-11 15:23:05 +00:00			`spark = started_cluster.spark_session`
Better 2023-03-29 18:01:21 +00:00			`minio_client = started_cluster.minio_client`
			`bucket = started_cluster.minio_bucket`
			`TABLE_NAME = "test_single_hudi_file"`

			`inserted_data = "SELECT number as a, toString(number) as b FROM numbers(100)"`
Fix 2023-04-05 18:32:37 +00:00			`parquet_data_path = create_initial_data_file(`
			`started_cluster, instance, inserted_data, TABLE_NAME`
			`)`
Better 2023-03-29 18:01:21 +00:00
			`write_hudi_from_file(spark, TABLE_NAME, parquet_data_path, f"/{TABLE_NAME}")`
			`files = upload_directory(minio_client, bucket, f"/{TABLE_NAME}", "")`
			`assert len(files) == 1`
			`assert files[0].endswith(".parquet")`
Hudi tests 2023-03-21 11:51:14 +00:00
Better 2023-03-29 18:01:21 +00:00			`create_hudi_table(instance, TABLE_NAME)`
			`assert instance.query(f"SELECT a, b FROM {TABLE_NAME}") == instance.query(`
			`inserted_data`
fix style fix style 2022-11-10 03:25:40 +00:00			`)`
Fix bug in test 2022-08-31 09:47:46 +00:00
Fix DESCRIBE for deltaLake and hudi table functions 2022-11-17 11:46:17 +00:00
Better 2023-03-29 18:01:21 +00:00			`def test_multiple_hudi_files(started_cluster):`
			`instance = started_cluster.instances["node1"]`
Close spark session 2023-04-11 15:23:05 +00:00			`spark = started_cluster.spark_session`
Better 2023-03-29 18:01:21 +00:00			`minio_client = started_cluster.minio_client`
			`bucket = started_cluster.minio_bucket`
			`TABLE_NAME = "test_multiple_hudi_files"`

Fix black check 2023-03-29 21:54:44 +00:00			`write_hudi_from_df(`
			`spark, TABLE_NAME, generate_data(spark, 0, 100), f"/{TABLE_NAME}"`
			`)`
Better 2023-03-29 18:01:21 +00:00			`files = upload_directory(minio_client, bucket, f"/{TABLE_NAME}", "")`
			`assert len(files) == 1`

			`create_hudi_table(instance, TABLE_NAME)`
			`assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100`
Automatic style fix 2022-11-17 11:54:13 +00:00
Better 2023-03-29 18:01:21 +00:00			`write_hudi_from_df(`
Fix black check 2023-03-29 21:54:44 +00:00			`spark,`
			`TABLE_NAME,`
			`generate_data(spark, 100, 200),`
			`f"/{TABLE_NAME}",`
			`mode="append",`
Better 2023-03-29 18:01:21 +00:00			`)`
			`files = upload_directory(minio_client, bucket, f"/{TABLE_NAME}", "")`
			`assert len(files) == 2`

			`assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 200`
Fix black check 2023-03-29 21:54:44 +00:00			`assert instance.query(`
			`f"SELECT a, b FROM {TABLE_NAME} ORDER BY 1"`
			`) == instance.query("SELECT number, toString(number + 1) FROM numbers(200)")`
Better 2023-03-29 18:01:21 +00:00
			`write_hudi_from_df(`
Fix black check 2023-03-29 21:54:44 +00:00			`spark,`
			`TABLE_NAME,`
			`generate_data(spark, 100, 300),`
			`f"/{TABLE_NAME}",`
			`mode="append",`
Better 2023-03-29 18:01:21 +00:00			`)`
			`files = upload_directory(minio_client, bucket, f"/{TABLE_NAME}", "")`
Fixes for hudi 2023-03-30 16:25:54 +00:00			`assert len(files) == 3`
Better 2023-03-29 18:01:21 +00:00
			`assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 300`
Fix black check 2023-03-29 21:54:44 +00:00			`assert instance.query(`
			`f"SELECT a, b FROM {TABLE_NAME} ORDER BY 1"`
			`) == instance.query("SELECT number, toString(number + 1) FROM numbers(300)")`
Better 2023-03-29 18:01:21 +00:00
Fixes for hudi 2023-03-30 16:25:54 +00:00			`assert int(instance.query(f"SELECT b FROM {TABLE_NAME} WHERE a = 100")) == 101`
			`write_hudi_from_df(`
			`spark,`
			`TABLE_NAME,`
			`generate_data(spark, 100, 101, append=0),`
			`f"/{TABLE_NAME}",`
			`mode="append",`
			`)`
			`files = upload_directory(minio_client, bucket, f"/{TABLE_NAME}", "")`

			`assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 300`
			`assert int(instance.query(f"SELECT b FROM {TABLE_NAME} WHERE a = 100")) == 100`

			`write_hudi_from_df(`
			`spark,`
			`TABLE_NAME,`
			`generate_data(spark, 100, 1000000, append=0),`
			`f"/{TABLE_NAME}",`
			`mode="append",`
			`)`
			`files = upload_directory(minio_client, bucket, f"/{TABLE_NAME}", "")`
			`assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 1000000`

Better 2023-03-29 18:01:21 +00:00
			`def test_types(started_cluster):`
			`instance = started_cluster.instances["node1"]`
Close spark session 2023-04-11 15:23:05 +00:00			`spark = started_cluster.spark_session`
Hudi tests 2023-03-21 11:51:14 +00:00			`minio_client = started_cluster.minio_client`
Fix DESCRIBE for deltaLake and hudi table functions 2022-11-17 11:46:17 +00:00			`bucket = started_cluster.minio_bucket`
Better 2023-03-29 18:01:21 +00:00			`TABLE_NAME = "test_types"`
Fix DESCRIBE for deltaLake and hudi table functions 2022-11-17 11:46:17 +00:00
Better 2023-03-29 18:01:21 +00:00			`data = [`
			`(`
			`123,`
			`"string",`
			`datetime.strptime("2000-01-01", "%Y-%m-%d"),`
			`["str1", "str2"],`
			`True,`
			`)`
			`]`
			`schema = StructType(`
			`[`
			`StructField("a", IntegerType()),`
			`StructField("b", StringType()),`
			`StructField("c", DateType()),`
			`StructField("d", ArrayType(StringType())),`
			`StructField("e", BooleanType()),`
			`]`
Hudi tests 2023-03-21 11:51:14 +00:00			`)`
Better 2023-03-29 18:01:21 +00:00			`df = spark.createDataFrame(data=data, schema=schema)`
			`df.printSchema()`
Fix black check 2023-03-29 21:54:44 +00:00			`write_hudi_from_df(spark, TABLE_NAME, df, f"/{TABLE_NAME}", mode="overwrite")`
Better 2023-03-29 18:01:21 +00:00
			`upload_directory(minio_client, bucket, f"/{TABLE_NAME}", "")`

			`create_hudi_table(instance, TABLE_NAME)`
			`assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 1`
			`assert (`
Better 2023-03-30 21:09:12 +00:00			`instance.query(f"SELECT a, b, c, d, e FROM {TABLE_NAME}").strip()`
Better 2023-03-29 18:01:21 +00:00			`== "123\tstring\t2000-01-01\t['str1','str2']\ttrue"`
Fix DESCRIBE for deltaLake and hudi table functions 2022-11-17 11:46:17 +00:00			`)`
Better 2023-03-29 18:01:21 +00:00
Better 2023-04-03 14:57:49 +00:00			`table_function = f"hudi(s3, filename='{TABLE_NAME}/')"`
Better 2023-03-30 21:09:12 +00:00			`assert (`
Black cehck 2023-03-31 11:01:40 +00:00			`instance.query(f"SELECT a, b, c, d, e FROM {table_function}").strip()`
			`== "123\tstring\t2000-01-01\t['str1','str2']\ttrue"`
Better 2023-03-30 21:09:12 +00:00			`)`

			`assert instance.query(f"DESCRIBE {table_function} FORMAT TSV") == TSV(`
Black cehck 2023-03-31 11:01:40 +00:00			`[`
Fix 2023-04-06 12:14:22 +00:00			`["_hoodie_commit_time", "Nullable(String)"],`
			`["_hoodie_commit_seqno", "Nullable(String)"],`
			`["_hoodie_record_key", "Nullable(String)"],`
			`["_hoodie_partition_path", "Nullable(String)"],`
			`["_hoodie_file_name", "Nullable(String)"],`
Black cehck 2023-03-31 11:01:40 +00:00			`["a", "Nullable(Int32)"],`
			`["b", "Nullable(String)"],`
			`["c", "Nullable(Date32)"],`
			`["d", "Array(Nullable(String))"],`
			`["e", "Nullable(Bool)"],`
			`]`
Better 2023-03-30 21:09:12 +00:00			`)`