ClickHouse/tests/integration/test_storage_hudi/test.py

152 lines
4.3 KiB
Python
Raw Normal View History

2022-08-30 09:14:05 +00:00
import logging
import os
2022-08-30 17:38:57 +00:00
import json
2022-08-30 09:14:05 +00:00
import helpers.client
import pytest
from helpers.cluster import ClickHouseCluster
2022-08-31 09:26:53 +00:00
from helpers.test_tools import TSV
2022-08-30 09:14:05 +00:00
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
2022-09-06 18:05:33 +00:00
2022-08-30 09:14:05 +00:00
def prepare_s3_bucket(started_cluster):
    """Apply an anonymous read/write policy to the cluster's MinIO bucket.

    The policy allows any principal to locate and list the bucket and to
    get and put objects inside it.

    The resource ARNs are derived from ``started_cluster.minio_bucket``
    instead of a hard-coded bucket name, so the policy always matches the
    bucket it is attached to.

    Args:
        started_cluster: object exposing ``minio_client`` (with a
            ``set_bucket_policy(bucket, policy_json)`` method) and
            ``minio_bucket`` (the bucket name).
    """
    bucket = started_cluster.minio_bucket
    bucket_arn = f"arn:aws:s3:::{bucket}"
    objects_arn = f"{bucket_arn}/*"

    # Bucket-level actions target the bucket ARN; object-level actions
    # target every key inside it.
    grants = [
        ("s3:GetBucketLocation", bucket_arn),
        ("s3:ListBucket", bucket_arn),
        ("s3:GetObject", objects_arn),
        ("s3:PutObject", objects_arn),
    ]

    bucket_read_write_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Sid": "",
                "Effect": "Allow",
                "Principal": {"AWS": "*"},
                "Action": action,
                "Resource": resource,
            }
            for action, resource in grants
        ],
    }

    started_cluster.minio_client.set_bucket_policy(
        bucket, json.dumps(bucket_read_write_policy)
    )
2022-09-06 18:05:33 +00:00
2022-08-30 09:14:05 +00:00
def upload_test_table(started_cluster):
    """Upload every file found under ``SCRIPT_DIR/test_table`` to MinIO.

    Each object is named after its local path with the ``SCRIPT_DIR``
    prefix sliced off, so the directory layout below ``test_table`` is
    reproduced inside the bucket.

    Args:
        started_cluster: object exposing ``minio_client`` and
            ``minio_bucket``.
    """
    target_bucket = started_cluster.minio_bucket
    prefix_length = len(SCRIPT_DIR)

    for directory, _subdirs, filenames in os.walk(SCRIPT_DIR + "/test_table"):
        # Path of the current directory relative to SCRIPT_DIR
        # (the prefix is removed by slicing, not by path arithmetic).
        object_dir = directory[prefix_length:]

        for filename in filenames:
            local_path = os.path.join(directory, filename)
            object_name = os.path.join(object_dir, filename)
            started_cluster.minio_client.fput_object(
                target_bucket, object_name, local_path
            )
2022-08-30 09:14:05 +00:00
@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture yielding a running cluster with MinIO.

    Starts a cluster with one instance named ``main_server`` (MinIO
    enabled), applies the bucket policy, uploads the test table, and yields
    the cluster. The cluster is shut down once the module's tests are done
    or if setup fails after the cluster object was created.
    """
    # Construct the cluster before entering ``try``: if the constructor
    # itself raises there is nothing to shut down, and touching the unbound
    # name in ``finally`` would mask the real error with UnboundLocalError.
    cluster = ClickHouseCluster(__file__)
    try:
        cluster.add_instance("main_server", with_minio=True)

        logging.info("Starting cluster...")
        cluster.start()

        prepare_s3_bucket(cluster)
        logging.info("S3 bucket created")

        upload_test_table(cluster)
        logging.info("Test table uploaded")

        yield cluster
    finally:
        cluster.shutdown()
2022-09-06 18:05:33 +00:00
2022-08-30 09:14:05 +00:00
def run_query(instance, query, stdin=None, settings=None):
    # type: (ClickHouseInstance, str, object, dict) -> str
    """Run ``query`` on ``instance``, logging before and after.

    Args:
        instance: object with a ``query(query, stdin=..., settings=...)``
            method.
        query: query text to execute.
        stdin: forwarded unchanged to ``instance.query``.
        settings: forwarded unchanged to ``instance.query``.

    Returns:
        Whatever ``instance.query`` returns.
    """
    start_message = "Running query '{}'...".format(query)
    logging.info(start_message)

    output = instance.query(query, stdin=stdin, settings=settings)

    logging.info("Query finished")
    return output
def test_create_query(started_cluster):
    """Check that a table with the Hudi engine can be created over MinIO."""
    instance = started_cluster.instances["main_server"]
    bucket = started_cluster.minio_bucket

    # The cluster is shared by the whole module, so a ``hudi`` table may
    # already exist (e.g. created by another test that ran first, or by a
    # repeated run). Drop it so the CREATE below is actually exercised and
    # the test does not depend on execution order.
    run_query(instance, "DROP TABLE IF EXISTS hudi")

    create_query = f"""CREATE TABLE hudi ENGINE=Hudi('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/test_table/', 'minio', 'minio123')"""

    run_query(instance, create_query)
2022-09-06 18:05:33 +00:00
2022-08-31 09:26:53 +00:00
def test_select_query(started_cluster):
    """Check that every column is selectable and partition paths match.

    Each listed column is selected on its own and must yield at least one
    row; then the distinct ``partitionpath`` values are compared against
    the expected three paths.
    """
    node = started_cluster.instances["main_server"]
    bucket = started_cluster.minio_bucket

    # Create the table in case it does not exist yet.
    create_query = f"""CREATE TABLE IF NOT EXISTS hudi ENGINE=Hudi('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/test_table/', 'minio', 'minio123')"""
    run_query(node, create_query)

    columns = (
        "_hoodie_commit_time",
        "_hoodie_commit_seqno",
        "_hoodie_record_key",
        "_hoodie_partition_path",
        "_hoodie_file_name",
        "begin_lat",
        "begin_lon",
        "driver",
        "end_lat",
        "end_lon",
        "fare",
        "partitionpath",
        "rider",
        "ts",
        "uuid",
    )

    select_template = "SELECT {} FROM hudi FORMAT TSV"
    for column_name in columns:
        column_query = select_template.format(column_name)
        rows = run_query(node, column_query).splitlines()
        assert len(rows) > 0

    # Check that all partition paths are present in the result.
    partition_rows = run_query(
        node,
        "SELECT DISTINCT partitionpath FROM hudi ORDER BY partitionpath FORMAT TSV",
    )
    expected_paths = [
        "americas/brazil/sao_paulo",
        "americas/united_states/san_francisco",
        "asia/india/chennai",
    ]

    assert TSV(partition_rows) == TSV(expected_paths)