import logging
import os
import json
import helpers.client
import pytest
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))


def prepare_s3_bucket(started_cluster):
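    """Set a bucket policy on the MinIO test bucket allowing anonymous GetBucketLocation, ListBucket, GetObject and PutObject."""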
    bucket_read_write_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Sid": "",
                "Effect": "Allow",
                "Principal": {"AWS": "*"},
                "Action": "s3:GetBucketLocation",
                "Resource": "arn:aws:s3:::root",
            },
            {
                "Sid": "",
                "Effect": "Allow",
                "Principal": {"AWS": "*"},
                "Action": "s3:ListBucket",
                "Resource": "arn:aws:s3:::root",
            },
            {
                "Sid": "",
                "Effect": "Allow",
                "Principal": {"AWS": "*"},
                "Action": "s3:GetObject",
                "Resource": "arn:aws:s3:::root/*",
            },
            {
                "Sid": "",
                "Effect": "Allow",
                "Principal": {"AWS": "*"},
                "Action": "s3:PutObject",
                "Resource": "arn:aws:s3:::root/*",
            },
        ],
    }

    minio_client = started_cluster.minio_client
    minio_client.set_bucket_policy(
        started_cluster.minio_bucket, json.dumps(bucket_read_write_policy)
    )


def upload_test_table(started_cluster):
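    """Upload the local test_table directory (a pre-generated Hudi table next to this script) into the MinIO bucket, preserving its relative layout."""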
    bucket = started_cluster.minio_bucket

    for address, dirs, files in os.walk(SCRIPT_DIR + "/test_table"):
        address_without_prefix = address[len(SCRIPT_DIR) :]

        for name in files:
            started_cluster.minio_client.fput_object(
                bucket,
                os.path.join(address_without_prefix, name),
                os.path.join(address, name),
            )


@pytest.fixture(scope="module")
def started_cluster():
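    """Module-scoped cluster fixture: start a single MinIO-backed ClickHouse instance, prepare the bucket and upload the test table."""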
    try:
        cluster = ClickHouseCluster(__file__)
        cluster.add_instance("main_server", with_minio=True)

        logging.info("Starting cluster...")
        cluster.start()

        prepare_s3_bucket(cluster)
        logging.info("S3 bucket created")

        upload_test_table(cluster)
        logging.info("Test table uploaded")

        yield cluster

    finally:
        cluster.shutdown()


def run_query(instance, query, stdin=None, settings=None):
    # type: (ClickHouseInstance, str, object, dict) -> str
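    """Log the query, execute it on the given instance and return the raw result."""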
logging . info ( " Running query ' {} ' ... " . format ( query ) )
result = instance . query ( query , stdin = stdin , settings = settings )
logging . info ( " Query finished " )
return result
def test_create_query ( started_cluster ) :
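    """Creating a table with the Hudi engine on top of the uploaded test table should succeed."""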
    instance = started_cluster.instances["main_server"]
    bucket = started_cluster.minio_bucket

    create_query = f"""CREATE TABLE hudi ENGINE=Hudi('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/test_table/', 'minio', 'minio123')"""

    run_query(instance, create_query)


def test_select_query(started_cluster):
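    """Every column of the test table should be readable both through the Hudi table engine and the hudi() table function, and the distinct partition paths should match the uploaded data."""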
    instance = started_cluster.instances["main_server"]
    bucket = started_cluster.minio_bucket
    columns = [
        "_hoodie_commit_time",
        "_hoodie_commit_seqno",
        "_hoodie_record_key",
        "_hoodie_partition_path",
        "_hoodie_file_name",
        "begin_lat",
        "begin_lon",
        "driver",
        "end_lat",
        "end_lon",
        "fare",
        "partitionpath",
        "rider",
        "ts",
        "uuid",
    ]

    # create query in case table doesn't exist
    create_query = f"""CREATE TABLE IF NOT EXISTS hudi ENGINE=Hudi('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/test_table/', 'minio', 'minio123')"""
    run_query(instance, create_query)

    select_query = "SELECT {} FROM hudi FORMAT TSV"
    select_table_function_query = "SELECT {col} FROM hudi('http://{ip}:{port}/{bucket}/test_table/', 'minio', 'minio123') FORMAT TSV"

    for column_name in columns:
        result = run_query(instance, select_query.format(column_name)).splitlines()
        assert len(result) > 0

    for column_name in columns:
        result = run_query(
            instance,
            select_table_function_query.format(
                col=column_name,
                ip=started_cluster.minio_ip,
                port=started_cluster.minio_port,
                bucket=bucket,
            ),
        ).splitlines()
        assert len(result) > 0

    # check that all partition paths are present in the result
    distinct_select_query = (
        "SELECT DISTINCT partitionpath FROM hudi ORDER BY partitionpath FORMAT TSV"
    )
    distinct_select_table_function_query = "SELECT DISTINCT partitionpath FROM hudi('http://{ip}:{port}/{bucket}/test_table/', 'minio', 'minio123') ORDER BY partitionpath FORMAT TSV"

    result = run_query(instance, distinct_select_query)
    result_table_function = run_query(
        instance,
        distinct_select_table_function_query.format(
            ip=started_cluster.minio_ip, port=started_cluster.minio_port, bucket=bucket
        ),
    )

    expected = [
        "americas/brazil/sao_paulo",
        "americas/united_states/san_francisco",
        "asia/india/chennai",
    ]

    assert TSV(result) == TSV(expected)
    assert TSV(result_table_function) == TSV(expected)