ClickHouse/tests/integration/test_storage_hudi/test.py

import logging
import os
import json

import helpers.client
import pytest
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))


def prepare_s3_bucket(started_cluster):
    """Attach a read/write policy to the cluster's MinIO bucket.

    The policy allows any principal to query the bucket location, list the
    bucket, and get/put objects in it. Resource ARNs are derived from
    ``started_cluster.minio_bucket`` so the policy always matches the bucket
    it is applied to.

    Args:
        started_cluster: object exposing ``minio_client`` (with a
            ``set_bucket_policy(bucket, policy_json)`` method) and
            ``minio_bucket`` (the bucket name).
    """
    bucket = started_cluster.minio_bucket
    bucket_arn = f"arn:aws:s3:::{bucket}"
    objects_arn = f"{bucket_arn}/*"

    # Bucket-level actions target the bucket ARN itself; object-level
    # actions target every key underneath it.
    grants = [
        ("s3:GetBucketLocation", bucket_arn),
        ("s3:ListBucket", bucket_arn),
        ("s3:GetObject", objects_arn),
        ("s3:PutObject", objects_arn),
    ]

    bucket_read_write_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Sid": "",
                "Effect": "Allow",
                "Principal": {"AWS": "*"},
                "Action": action,
                "Resource": resource,
            }
            for action, resource in grants
        ],
    }

    minio_client = started_cluster.minio_client
    minio_client.set_bucket_policy(bucket, json.dumps(bucket_read_write_policy))


def upload_test_table(started_cluster):
    """Upload every file under the local ``test_table`` directory to MinIO.

    Each object is named after the file's path with the ``SCRIPT_DIR``
    prefix stripped, so object names start with ``/test_table`` and mirror
    the on-disk layout.

    Args:
        started_cluster: object exposing ``minio_client`` and ``minio_bucket``.
    """
    target_bucket = started_cluster.minio_bucket
    prefix_length = len(SCRIPT_DIR)

    for directory, _subdirs, filenames in os.walk(SCRIPT_DIR + "/test_table"):
        # Path of this directory relative to the test script's directory.
        remote_directory = directory[prefix_length:]

        for filename in filenames:
            object_name = os.path.join(remote_directory, filename)
            local_path = os.path.join(directory, filename)
            started_cluster.minio_client.fput_object(
                target_bucket, object_name, local_path
            )


@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture yielding a started cluster with test data in MinIO.

    Adds the ``main_server`` instance with MinIO enabled, starts the cluster,
    applies the bucket policy and uploads the ``test_table`` files before
    yielding. The cluster is shut down when the module's tests finish or if
    setup fails.

    Yields:
        The started ``ClickHouseCluster``.
    """
    # Construct the cluster before entering ``try``: if the constructor raised
    # inside it, ``finally`` would reference an unbound ``cluster`` and the
    # resulting UnboundLocalError would hide the real failure.
    cluster = ClickHouseCluster(__file__)
    try:
        cluster.add_instance("main_server", with_minio=True)

        logging.info("Starting cluster...")
        cluster.start()

        prepare_s3_bucket(cluster)
        logging.info("S3 bucket created")

        upload_test_table(cluster)
        logging.info("Test table uploaded")

        yield cluster

    finally:
        cluster.shutdown()


def run_query(instance, query, stdin=None, settings=None):
    # type: (ClickHouseInstance, str, object, dict) -> str
    """Run ``query`` on ``instance`` and return its result, logging both ends.

    Args:
        instance: object whose ``query(query, stdin=..., settings=...)``
            method executes the statement.
        query: SQL text to execute.
        stdin: forwarded unchanged to ``instance.query``.
        settings: forwarded unchanged to ``instance.query``.

    Returns:
        Whatever ``instance.query`` returns.
    """
    logging.info(f"Running query '{query}'...")
    output = instance.query(query, stdin=stdin, settings=settings)
    logging.info("Query finished")
    return output


def test_create_query(started_cluster):
    """Check that a table with the Hudi engine can be created over the MinIO data."""
    instance = started_cluster.instances["main_server"]
    bucket = started_cluster.minio_bucket

    # The cluster is shared by the whole module, so a ``hudi`` table may be
    # left over from another test or a repeated run. Drop it first so the
    # plain CREATE below does not fail just because the table already exists.
    run_query(instance, "DROP TABLE IF EXISTS hudi")

    create_query = f"""CREATE TABLE hudi ENGINE=Hudi('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/test_table/', 'minio', 'minio123')"""

    run_query(instance, create_query)


def test_select_query(started_cluster):
    """Check that each expected column is selectable and all partitions appear.

    Selecting every listed column individually must return at least one row,
    and the distinct ``partitionpath`` values must equal the three expected
    partition paths.
    """
    node = started_cluster.instances["main_server"]
    bucket = started_cluster.minio_bucket

    # Create the table in case it does not exist yet.
    table_url = f"http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/test_table/"
    run_query(
        node,
        f"CREATE TABLE IF NOT EXISTS hudi ENGINE=Hudi('{table_url}', 'minio', 'minio123')",
    )

    expected_columns = (
        "_hoodie_commit_time",
        "_hoodie_commit_seqno",
        "_hoodie_record_key",
        "_hoodie_partition_path",
        "_hoodie_file_name",
        "begin_lat",
        "begin_lon",
        "driver",
        "end_lat",
        "end_lon",
        "fare",
        "partitionpath",
        "rider",
        "ts",
        "uuid",
    )

    # Every column must be readable on its own and return at least one row.
    for column in expected_columns:
        rows = run_query(node, f"SELECT {column} FROM hudi FORMAT TSV").splitlines()
        assert len(rows) > 0

    # Check that all partition paths are present in the result.
    partitions = run_query(
        node,
        "SELECT DISTINCT partitionpath FROM hudi ORDER BY partitionpath FORMAT TSV",
    )
    expected_partitions = [
        "americas/brazil/sao_paulo",
        "americas/united_states/san_francisco",
        "asia/india/chennai",
    ]

    assert TSV(partitions) == TSV(expected_partitions)
Add test template 2022-08-30 09:14:05 +00:00			`import logging`
			`import os`
Add createquery test 2022-08-30 17:38:57 +00:00			`import json`
Add test template 2022-08-30 09:14:05 +00:00
			`import helpers.client`
			`import pytest`
			`from helpers.cluster import ClickHouseCluster`
Add select test 2022-08-31 09:26:53 +00:00			`from helpers.test_tools import TSV`
Add test template 2022-08-30 09:14:05 +00:00
			`SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))`

Apply black formatter to tests 2022-09-06 18:05:33 +00:00
Add test template 2022-08-30 09:14:05 +00:00			`def prepare_s3_bucket(started_cluster):`
			`bucket_read_write_policy = {`
			`"Version": "2012-10-17",`
			`"Statement": [`
			`{`
			`"Sid": "",`
			`"Effect": "Allow",`
			`"Principal": {"AWS": "*"},`
			`"Action": "s3:GetBucketLocation",`
			`"Resource": "arn:aws:s3:::root",`
			`},`
			`{`
			`"Sid": "",`
			`"Effect": "Allow",`
			`"Principal": {"AWS": "*"},`
			`"Action": "s3:ListBucket",`
			`"Resource": "arn:aws:s3:::root",`
			`},`
			`{`
			`"Sid": "",`
			`"Effect": "Allow",`
			`"Principal": {"AWS": "*"},`
			`"Action": "s3:GetObject",`
			`"Resource": "arn:aws:s3:::root/*",`
			`},`
			`{`
			`"Sid": "",`
			`"Effect": "Allow",`
			`"Principal": {"AWS": "*"},`
			`"Action": "s3:PutObject",`
			`"Resource": "arn:aws:s3:::root/*",`
			`},`
			`],`
			`}`

			`minio_client = started_cluster.minio_client`
			`minio_client.set_bucket_policy(`
			`started_cluster.minio_bucket, json.dumps(bucket_read_write_policy)`
			`)`

Apply black formatter to tests 2022-09-06 18:05:33 +00:00
Add test template 2022-08-30 09:14:05 +00:00			`def upload_test_table(started_cluster):`
			`bucket = started_cluster.minio_bucket`

Add createquery test 2022-08-30 17:38:57 +00:00			`for address, dirs, files in os.walk(SCRIPT_DIR + "/test_table"):`
Apply black formatter to tests 2022-09-06 18:05:33 +00:00			`address_without_prefix = address[len(SCRIPT_DIR) :]`
Add createquery test 2022-08-30 17:38:57 +00:00
Add test template 2022-08-30 09:14:05 +00:00			`for name in files:`
Apply black formatter to tests 2022-09-06 18:05:33 +00:00			`started_cluster.minio_client.fput_object(`
			`bucket,`
			`os.path.join(address_without_prefix, name),`
			`os.path.join(address, name),`
			`)`

Add test template 2022-08-30 09:14:05 +00:00
			`@pytest.fixture(scope="module")`
			`def started_cluster():`
			`try:`
			`cluster = ClickHouseCluster(__file__)`
Apply black formatter to tests 2022-09-06 18:05:33 +00:00			`cluster.add_instance("main_server", with_minio=True)`
Add test template 2022-08-30 09:14:05 +00:00
			`logging.info("Starting cluster...")`
			`cluster.start()`

			`prepare_s3_bucket(cluster)`
			`logging.info("S3 bucket created")`

			`upload_test_table(cluster)`
			`logging.info("Test table uploaded")`

			`yield cluster`
Apply black formatter to tests 2022-09-06 18:05:33 +00:00
Add test template 2022-08-30 09:14:05 +00:00			`finally:`
			`cluster.shutdown()`

Apply black formatter to tests 2022-09-06 18:05:33 +00:00
Add test template 2022-08-30 09:14:05 +00:00			`def run_query(instance, query, stdin=None, settings=None):`
			`# type: (ClickHouseInstance, str, object, dict) -> str`

			`logging.info("Running query '{}'...".format(query))`
			`result = instance.query(query, stdin=stdin, settings=settings)`
			`logging.info("Query finished")`

			`return result`


			`def test_create_query(started_cluster):`
Add select test 2022-08-31 09:26:53 +00:00			`instance = started_cluster.instances["main_server"]`
Add test template 2022-08-30 09:14:05 +00:00			`bucket = started_cluster.minio_bucket`

Add createquery test 2022-08-30 17:38:57 +00:00			`create_query = f"""CREATE TABLE hudi ENGINE=Hudi('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/test_table/', 'minio', 'minio123')"""`
Add test template 2022-08-30 09:14:05 +00:00
			`run_query(instance, create_query)`

Apply black formatter to tests 2022-09-06 18:05:33 +00:00
Add select test 2022-08-31 09:26:53 +00:00			`def test_select_query(started_cluster):`
			`instance = started_cluster.instances["main_server"]`
			`bucket = started_cluster.minio_bucket`
Apply black formatter to tests 2022-09-06 18:05:33 +00:00			`columns = [`
			`"_hoodie_commit_time",`
			`"_hoodie_commit_seqno",`
			`"_hoodie_record_key",`
			`"_hoodie_partition_path",`
			`"_hoodie_file_name",`
			`"begin_lat",`
			`"begin_lon",`
			`"driver",`
			`"end_lat",`
			`"end_lon",`
			`"fare",`
			`"partitionpath",`
			`"rider",`
			`"ts",`
			`"uuid",`
			`]`

Add select test 2022-08-31 09:26:53 +00:00			`# create query in case table doesn't exist`
			`create_query = f"""CREATE TABLE IF NOT EXISTS hudi ENGINE=Hudi('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/test_table/', 'minio', 'minio123')"""`

			`run_query(instance, create_query)`

			`select_query = "SELECT {} FROM hudi FORMAT TSV"`

			`for column_name in columns:`
			`result = run_query(instance, select_query.format(column_name)).splitlines()`
Apply black formatter to tests 2022-09-06 18:05:33 +00:00			`assert len(result) > 0`
Add select test 2022-08-31 09:26:53 +00:00
Apply black formatter to tests 2022-09-06 18:05:33 +00:00			`# test if all partition paths is presented in result`
			`distinct_select_query = (`
			`"SELECT DISTINCT partitionpath FROM hudi ORDER BY partitionpath FORMAT TSV"`
			`)`
Fix bug in test 2022-08-31 09:47:46 +00:00			`result = run_query(instance, distinct_select_query)`
Apply black formatter to tests 2022-09-06 18:05:33 +00:00			`expected = [`
			`"americas/brazil/sao_paulo",`
			`"americas/united_states/san_francisco",`
			`"asia/india/chennai",`
			`]`
Fix bug in test 2022-08-31 09:47:46 +00:00
Apply black formatter to tests 2022-09-06 18:05:33 +00:00			`assert TSV(result) == TSV(expected)`