import gzip
import json
import logging
import os
import io
import random
import threading
import time

import helpers.client
import pytest
from helpers.cluster import ClickHouseCluster, ClickHouseInstance

MINIO_INTERNAL_PORT = 9001

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
CONFIG_PATH = os.path.join(SCRIPT_DIR, './_instances/dummy/configs/config.d/defaultS3.xml')


# Creates S3 bucket for tests and allows anonymous read-write access to it.
def prepare_s3_bucket(started_cluster):
    # Allows read-write access for bucket without authorization.
    bucket_read_write_policy = {"Version": "2012-10-17",
                                "Statement": [
                                    {
                                        "Sid": "",
                                        "Effect": "Allow",
                                        "Principal": {"AWS": "*"},
                                        "Action": "s3:GetBucketLocation",
                                        "Resource": "arn:aws:s3:::root"
                                    },
                                    {
                                        "Sid": "",
                                        "Effect": "Allow",
                                        "Principal": {"AWS": "*"},
                                        "Action": "s3:ListBucket",
                                        "Resource": "arn:aws:s3:::root"
                                    },
                                    {
                                        "Sid": "",
                                        "Effect": "Allow",
                                        "Principal": {"AWS": "*"},
                                        "Action": "s3:GetObject",
                                        "Resource": "arn:aws:s3:::root/*"
                                    },
                                    {
                                        "Sid": "",
                                        "Effect": "Allow",
                                        "Principal": {"AWS": "*"},
                                        "Action": "s3:PutObject",
                                        "Resource": "arn:aws:s3:::root/*"
                                    }
                                ]}

    minio_client = started_cluster.minio_client
    minio_client.set_bucket_policy(started_cluster.minio_bucket, json.dumps(bucket_read_write_policy))
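
    # A second bucket without the anonymous policy: requests to it must carry
    # valid MinIO credentials, which the "auth" test cases below rely on.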
    started_cluster.minio_restricted_bucket = "{}-with-auth".format(started_cluster.minio_bucket)
    if minio_client.bucket_exists(started_cluster.minio_restricted_bucket):
        minio_client.remove_bucket(started_cluster.minio_restricted_bucket)

    minio_client.make_bucket(started_cluster.minio_restricted_bucket)


# Uploads raw bytes to the given bucket as an S3 object.
def put_s3_file_content(started_cluster, bucket, filename, data):
    buf = io.BytesIO(data)
    started_cluster.minio_client.put_object(bucket, filename, buf, len(data))


# Returns the content of the given S3 file as a string (raw bytes when decode=False).
def get_s3_file_content(started_cluster, bucket, filename, decode=True):
    # type: (ClickHouseCluster, str, str, bool) -> str

    data = started_cluster.minio_client.get_object(bucket, filename)
    data_str = b""
    for chunk in data.stream():
        data_str += chunk
    if decode:
        return data_str.decode()
    return data_str
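

# Module-wide fixture: starts a ClickHouse cluster with three instances
# ("dummy" without restrictions, "restricted_dummy" with a remote host filter,
# "s3_max_redirects" with a low redirect limit), a MinIO server, and the HTTP
# mocks used by the tests below.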
@pytest.fixture(scope="module")
def started_cluster():
    try:
        cluster = ClickHouseCluster(__file__)
        cluster.add_instance("restricted_dummy", main_configs=["configs/config_for_test_remote_host_filter.xml"],
                             with_minio=True)
        cluster.add_instance("dummy", with_minio=True, main_configs=["configs/defaultS3.xml"])
        cluster.add_instance("s3_max_redirects", with_minio=True, main_configs=["configs/defaultS3.xml"],
                             user_configs=["configs/s3_max_redirects.xml"])

        logging.info("Starting cluster...")
        cluster.start()
        logging.info("Cluster started")

        prepare_s3_bucket(cluster)
        logging.info("S3 bucket created")

        run_s3_mocks(cluster)

        yield cluster
    finally:
        cluster.shutdown()


def run_query(instance, query, stdin=None, settings=None):
    # type: (ClickHouseInstance, str, object, dict) -> str

    logging.info("Running query '{}'...".format(query))
    result = instance.query(query, stdin=stdin, settings=settings)
    logging.info("Query finished")

    return result


# Test simple put. Also checks that wrong credentials produce an error with every compression method.
@pytest.mark.parametrize("maybe_auth,positive,compression", [
    pytest.param("", True, 'auto', id="positive"),
    pytest.param("'minio','minio123',", True, 'auto', id="auth_positive"),
    pytest.param("'wrongid','wrongkey',", False, 'auto', id="auto"),
    pytest.param("'wrongid','wrongkey',", False, 'gzip', id="gzip"),
    pytest.param("'wrongid','wrongkey',", False, 'deflate', id="deflate"),
    pytest.param("'wrongid','wrongkey',", False, 'brotli', id="brotli"),
    pytest.param("'wrongid','wrongkey',", False, 'xz', id="xz"),
    pytest.param("'wrongid','wrongkey',", False, 'zstd', id="zstd")
])
def test_put(started_cluster, maybe_auth, positive, compression):
    # type: (ClickHouseCluster) -> None

    bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
    values_csv = "1,2,3\n3,2,1\n78,43,45\n"
    filename = "test.csv"
    put_query = f"""insert into table function s3('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/{filename}',
                    {maybe_auth}'CSV', '{table_format}', {compression}) values {values}"""

    try:
        run_query(instance, put_query)
    except helpers.client.QueryRuntimeException:
        if positive:
            raise
    else:
        assert positive
        assert values_csv == get_s3_file_content(started_cluster, bucket, filename)
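

# Keys may contain characters that must be percent-encoded in URLs. The file is
# stored with a literal space or '+' in its name, and the queries reference it
# with %20/%2B, both directly and through glob patterns.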
@pytest.mark.parametrize("special", [
    "space",
    "plus"
])
def test_get_file_with_special(started_cluster, special):
    symbol = {"space": " ", "plus": "+"}[special]
    urlsafe_symbol = {"space": "%20", "plus": "%2B"}[special]
    auth = "'minio','minio123',"
    bucket = started_cluster.minio_restricted_bucket
    instance = started_cluster.instances["dummy"]
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = [[12549, 2463, 19893], [64021, 38652, 66703], [81611, 39650, 83516], [11079, 59507, 61546],
              [51764, 69952, 6876], [41165, 90293, 29095], [40167, 78432, 48309], [81629, 81327, 11855],
              [55852, 21643, 98507], [6738, 54643, 41155]]
    values_csv = ('\n'.join((','.join(map(str, row)) for row in values)) + '\n').encode()
    filename = f"get_file_with_{special}_{symbol}two.csv"
    put_s3_file_content(started_cluster, bucket, filename, values_csv)

    get_query = f"SELECT * FROM s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/get_file_with_{special}_{urlsafe_symbol}two.csv', {auth}'CSV', '{table_format}') FORMAT TSV"
    assert [list(map(int, l.split())) for l in run_query(instance, get_query).splitlines()] == values

    get_query = f"SELECT * FROM s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/get_file_with_{special}*.csv', {auth}'CSV', '{table_format}') FORMAT TSV"
    assert [list(map(int, l.split())) for l in run_query(instance, get_query).splitlines()] == values

    get_query = f"SELECT * FROM s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/get_file_with_{special}_{urlsafe_symbol}*.csv', {auth}'CSV', '{table_format}') FORMAT TSV"
    assert [list(map(int, l.split())) for l in run_query(instance, get_query).splitlines()] == values
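

# The echo.py mock at resolver:8082 replies with the path it was asked for, so
# the assertion below shows exactly which bytes ClickHouse put on the wire for
# each encoded character.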
@pytest.mark.parametrize("special", [
    "space",
    "plus",
    "plus2"
])
def test_get_path_with_special(started_cluster, special):
    symbol = {"space": "%20", "plus": "%2B", "plus2": "%2B"}[special]
    safe_symbol = {"space": "%20", "plus": "+", "plus2": "%2B"}[special]
    auth = "'minio','minio123',"
    table_format = "column1 String"
    instance = started_cluster.instances["dummy"]
    get_query = f"SELECT * FROM s3('http://resolver:8082/get-my-path/{safe_symbol}.csv', {auth}'CSV', '{table_format}') FORMAT TSV"
    assert run_query(instance, get_query).splitlines() == [f"/{symbol}.csv"]


# Test putting no data to S3.
@pytest.mark.parametrize("auth", [
    pytest.param("'minio','minio123',", id="minio")
])
def test_empty_put(started_cluster, auth):
    # type: (ClickHouseCluster, str) -> None

    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"

    create_empty_table_query = """
        CREATE TABLE empty_table (
        {}
        ) ENGINE = Null()
    """.format(table_format)
    run_query(instance, create_empty_table_query)

    filename = "empty_put_test.csv"
    put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') select * from empty_table".format(
        started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename, auth, table_format)
    run_query(instance, put_query)

    # Inserting zero rows must not create the object, so reading it back fails.
    try:
        run_query(instance, "select count(*) from s3('http://{}:{}/{}/{}', {}'CSV', '{}')".format(
            started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename, auth, table_format))
        assert False, "Query should have failed."
    except helpers.client.QueryRuntimeException as e:
        assert str(e).find("The specified key does not exist") != -1


# Test put values in CSV format.
@pytest.mark.parametrize("maybe_auth,positive", [
    pytest.param("", True, id="positive"),
    pytest.param("'minio','minio123',", True, id="auth_positive"),
    pytest.param("'wrongid','wrongkey',", False, id="negative"),
])
def test_put_csv(started_cluster, maybe_auth, positive):
    # type: (ClickHouseCluster, str, bool) -> None

    bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format(
        started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename, maybe_auth, table_format)
    csv_data = "8,9,16\n11,18,13\n22,14,2\n"

    try:
        run_query(instance, put_query, stdin=csv_data)
    except helpers.client.QueryRuntimeException:
        if positive:
            raise
    else:
        assert positive
        assert csv_data == get_s3_file_content(started_cluster, bucket, filename)


# Test put and get with S3 server redirect.
def test_put_get_with_redirect(started_cluster):
    # type: (ClickHouseCluster) -> None

    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
    values_csv = "1,1,1\n1,1,1\n11,11,11\n"
    filename = "test.csv"
    query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, filename, table_format,
        values)
    run_query(instance, query)

    assert values_csv == get_s3_file_content(started_cluster, bucket, filename)

    query = "select *, column1*column2*column3 from s3('http://{}:{}/{}/{}', 'CSV', '{}')".format(
        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, filename, table_format)
    stdout = run_query(instance, query)

    assert list(map(str.split, stdout.splitlines())) == [
        ["1", "1", "1", "1"],
        ["1", "1", "1", "1"],
        ["11", "11", "11", "1331"],
    ]
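

# minio_redirect_host/minio_redirect_port point at a proxy (nginx) in front of
# MinIO that answers with HTTP redirects, so the tests above and below also
# exercise ClickHouse's redirect-following logic.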
# Test put with restricted S3 server redirect.
def test_put_with_zero_redirect(started_cluster):
    # type: (ClickHouseCluster) -> None

    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["s3_max_redirects"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
    filename = "test.csv"

    # Should work without redirect
    query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
        started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename, table_format, values)
    run_query(instance, query)

    # Should not work with redirect
    query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, filename, table_format,
        values)
    exception_raised = False
    try:
        run_query(instance, query)
    except Exception as e:
        assert str(e).find("Too many redirects while trying to access") != -1
        exception_raised = True
    finally:
        assert exception_raised
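

# 100 one-row files are written as <i>_<letter>/<j>.csv and then read back with
# a single glob. "%3f" is a percent-encoded '?' (single-character wildcard);
# _file and _path are ClickHouse's virtual columns for the matched file name
# and full path.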
def test_put_get_with_globs(started_cluster):
    # type: (ClickHouseCluster) -> None

    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    max_path = ""
    for i in range(10):
        for j in range(10):
            path = "{}_{}/{}.csv".format(i, random.choice(['a', 'b', 'c', 'd']), j)
            max_path = max(path, max_path)
            values = "({},{},{})".format(i, j, i + j)
            query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
                started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, path, table_format, values)
            run_query(instance, query)

    query = "select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from s3('http://{}:{}/{}/*_{{a,b,c,d}}/%3f.csv', 'CSV', '{}')".format(
        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, table_format)
    assert run_query(instance, query).splitlines() == [
        "450\t450\t900\t0.csv\t{bucket}/{max_path}".format(bucket=bucket, max_path=max_path)]


# Test multipart put.
@pytest.mark.parametrize("maybe_auth,positive", [
    pytest.param("", True, id="positive"),
    pytest.param("'wrongid','wrongkey'", False, id="negative"),
    # ("'minio','minio123',",True), Redirect with credentials not working with nginx.
])
def test_multipart_put(started_cluster, maybe_auth, positive):
    # type: (ClickHouseCluster) -> None

    bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"

    # Minimum size of part is 5 Mb for Minio.
    # See: https://github.com/minio/minio/blob/master/docs/minio-limits.md
    min_part_size_bytes = 5 * 1024 * 1024
    csv_size_bytes = int(min_part_size_bytes * 1.5)  # To have 2 parts.

    one_line_length = 6  # 3 digits, 2 commas, 1 line separator.

    # Generate data having size more than one part.
    int_data = [[1, 2, 3] for i in range(csv_size_bytes // one_line_length)]
    csv_data = "".join(["{},{},{}\n".format(x, y, z) for x, y, z in int_data])

    assert len(csv_data) > min_part_size_bytes

    filename = "test_multipart.csv"
    put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format(
        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, filename, maybe_auth,
        table_format)

    try:
        # s3_max_single_part_upload_size=0 forces the multipart code path even
        # though the buffer would otherwise fit in a single-part upload.
        run_query(instance, put_query, stdin=csv_data, settings={'s3_min_upload_part_size': min_part_size_bytes,
                                                                 's3_max_single_part_upload_size': 0})
    except helpers.client.QueryRuntimeException:
        if positive:
            raise
    else:
        assert positive

        # Use proxy access logs to count number of parts uploaded to Minio.
        proxy_logs = started_cluster.get_container_logs("proxy1")  # type: str
        assert proxy_logs.count("PUT /{}/{}".format(bucket, filename)) >= 2

        assert csv_data == get_s3_file_content(started_cluster, bucket, filename)
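

# The "restricted_dummy" instance runs with config_for_test_remote_host_filter.xml,
# which whitelists the hosts it may talk to; URLs pointing anywhere else must
# be rejected.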
def test_remote_host_filter(started_cluster):
    instance = started_cluster.instances["restricted_dummy"]
    format = "column1 UInt32, column2 UInt32, column3 UInt32"

    query = "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(
        "invalid_host", MINIO_INTERNAL_PORT, started_cluster.minio_bucket, format)
    assert "not allowed in config.xml" in instance.query_and_get_error(query)

    other_values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
    query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(
        "invalid_host", MINIO_INTERNAL_PORT, started_cluster.minio_bucket, format, other_values)
    assert "not allowed in config.xml" in instance.query_and_get_error(query)


@pytest.mark.parametrize("s3_storage_args", [
    pytest.param("''", id="1_argument"),
    pytest.param("'','','','','',''", id="6_arguments"),
])
def test_wrong_s3_syntax(started_cluster, s3_storage_args):
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    expected_err_msg = "Code: 42"  # NUMBER_OF_ARGUMENTS_DOESNT_MATCH

    query = "create table test_table_s3_syntax (id UInt32) ENGINE = S3({})".format(s3_storage_args)
    assert expected_err_msg in instance.query_and_get_error(query)


# https://en.wikipedia.org/wiki/One_Thousand_and_One_Nights
def test_s3_glob_scheherazade(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    max_path = ""
    values = "(1, 1, 1)"
    nights_per_job = 1001 // 30
    jobs = []
    for night in range(0, 1001, nights_per_job):
        def add_tales(start, end):
            for i in range(start, end):
                path = "night_{}/tale.csv".format(i)
                query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
                    started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, path, table_format, values)
                run_query(instance, query)

        jobs.append(threading.Thread(target=add_tales, args=(night, min(night + nights_per_job, 1001))))
        jobs[-1].start()

    for job in jobs:
        job.join()

    query = "select count(), sum(column1), sum(column2), sum(column3) from s3('http://{}:{}/{}/night_*/tale.csv', 'CSV', '{}')".format(
        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, table_format)
    assert run_query(instance, query).splitlines() == ["1001\t1001\t1001\t1001"]
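

# Each mock is a small standalone HTTP server copied into and started inside
# the "resolver" container: mock_s3.py (8080) backs the custom-header tests,
# unstable_server.py (8081) the retry test, and echo.py (8082) the path tests.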
def run_s3_mocks(started_cluster):
    logging.info("Starting s3 mocks")
    mocks = (
        ("mock_s3.py", "resolver", "8080"),
        ("unstable_server.py", "resolver", "8081"),
        ("echo.py", "resolver", "8082"),
    )
    for mock_filename, container, port in mocks:
        container_id = started_cluster.get_container_id(container)
        current_dir = os.path.dirname(__file__)
        started_cluster.copy_file_to_container(container_id, os.path.join(current_dir, "s3_mocks", mock_filename),
                                               mock_filename)
        started_cluster.exec_in_container(container_id, ["python", mock_filename, port], detach=True)

    # Wait for S3 mocks to start
    for mock_filename, container, port in mocks:
        num_attempts = 100
        for attempt in range(num_attempts):
            ping_response = started_cluster.exec_in_container(started_cluster.get_container_id(container),
                                                              ["curl", "-s", f"http://localhost:{port}/"],
                                                              nothrow=True)
            if ping_response != 'OK':
                if attempt == num_attempts - 1:
                    assert ping_response == 'OK', 'Expected "OK", but got "{}"'.format(ping_response)
                else:
                    time.sleep(1)
            else:
                logging.debug(f"mock {mock_filename} ({port}) answered {ping_response} on attempt {attempt}")
                break

    logging.info("S3 mocks started")


# Rewrites the instance's defaultS3.xml in place so SYSTEM RELOAD CONFIG picks
# up the change.
def replace_config(old, new):
    with open(CONFIG_PATH, 'r') as config:
        config_lines = config.readlines()
    config_lines = [line.replace(old, new) for line in config_lines]
    with open(CONFIG_PATH, 'w') as config:
        config.writelines(config_lines)
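

# defaultS3.xml makes ClickHouse attach an "Authorization: Bearer TOKEN" header
# to requests against the resolver:8080 endpoint; swapping in an invalid token
# and reloading the config should make the same query fail.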
def test_custom_auth_headers(started_cluster):
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    get_query = "select * from s3('http://resolver:8080/{bucket}/{file}', 'CSV', '{table_format}')".format(
        bucket=started_cluster.minio_restricted_bucket,
        file=filename,
        table_format=table_format)

    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    result = run_query(instance, get_query)
    assert result == '1\t2\t3\n'

    instance.query(
        "CREATE TABLE test ({table_format}) ENGINE = S3('http://resolver:8080/{bucket}/{file}', 'CSV')".format(
            bucket=started_cluster.minio_restricted_bucket,
            file=filename,
            table_format=table_format
        ))
    assert run_query(instance, "SELECT * FROM test") == '1\t2\t3\n'

    replace_config("<header>Authorization: Bearer TOKEN", "<header>Authorization: Bearer INVALID_TOKEN")
    instance.query("SYSTEM RELOAD CONFIG")
    ret, err = instance.query_and_get_answer_with_error("SELECT * FROM test")
    assert ret == "" and err != ""

    replace_config("<header>Authorization: Bearer INVALID_TOKEN", "<header>Authorization: Bearer TOKEN")
    instance.query("SYSTEM RELOAD CONFIG")
    assert run_query(instance, "SELECT * FROM test") == '1\t2\t3\n'


def test_custom_auth_headers_exclusion(started_cluster):
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    get_query = f"SELECT * FROM s3('http://resolver:8080/{started_cluster.minio_restricted_bucket}/restricteddirectory/{filename}', 'CSV', '{table_format}')"

    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    with pytest.raises(helpers.client.QueryRuntimeException) as ei:
        result = run_query(instance, get_query)
        print(result)

    assert ei.value.returncode == 243
    assert 'Forbidden Error' in ei.value.stderr


def test_infinite_redirect(started_cluster):
    bucket = "redirected"
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    get_query = f"select * from s3('http://resolver:{started_cluster.minio_redirect_port}/{bucket}/{filename}', 'CSV', '{table_format}')"
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance

    exception_raised = False
    try:
        run_query(instance, get_query)
    except Exception as e:
        assert str(e).find("Too many redirects while trying to access") != -1
        exception_raised = True
    finally:
        assert exception_raised
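

# With method='auto', ClickHouse infers the compression from the file
# extension, so a .gz object needs no explicit 'gzip' argument, while the
# arbitrary .bin extension does.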
@pytest.mark.parametrize("extension,method", [
    pytest.param("bin", "gzip", id="bin"),
    pytest.param("gz", "auto", id="gz"),
])
def test_storage_s3_get_gzip(started_cluster, extension, method):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    filename = f"test_get_gzip.{extension}"
    name = f"test_get_gzip_{extension}"
    data = [
        "Sophia Intrieri,55",
        "Jack Taylor,71",
        "Christopher Silva,66",
        "Clifton Purser,35",
        "Richard Aceuedo,43",
        "Lisa Hensley,31",
        "Alice Wehrley,1",
        "Mary Farmer,47",
        "Samara Ramirez,19",
        "Shirley Lloyd,51",
        "Santos Cowger,0",
        "Richard Mundt,88",
        "Jerry Gonzalez,15",
        "Angela James,10",
        "Norman Ortega,33",
        ""
    ]
    buf = io.BytesIO()
    compressed = gzip.GzipFile(fileobj=buf, mode="wb")
    compressed.write(("\n".join(data)).encode())
    compressed.close()
    put_s3_file_content(started_cluster, bucket, filename, buf.getvalue())

    run_query(instance, f"""CREATE TABLE {name} (name String, id UInt32) ENGINE = S3(
                                'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{filename}',
                                'CSV',
                                '{method}')""")

    assert run_query(instance, "SELECT sum(id) FROM {}".format(name)).splitlines() == ["565"]
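

# The unstable_server.py mock at resolver:8081 serves a generated CSV over a
# deliberately unreliable connection, so this read only succeeds if the retry
# logic works.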
def test_storage_s3_get_unstable(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64"
    get_query = f"SELECT count(), sum(column3), sum(column4) FROM s3('http://resolver:8081/{started_cluster.minio_bucket}/test.csv', 'CSV', '{table_format}') FORMAT CSV"
    result = run_query(instance, get_query)
    assert result.splitlines() == ["500001,500000,0"]


def test_storage_s3_put_uncompressed(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    filename = "test_put_uncompressed.bin"
    name = "test_put_uncompressed"
    data = [
        "'Gloria Thompson',99",
        "'Matthew Tang',98",
        "'Patsy Anderson',23",
        "'Nancy Badillo',93",
        "'Roy Hunt',5",
        "'Adam Kirk',51",
        "'Joshua Douds',28",
        "'Jolene Ryan',0",
        "'Roxanne Padilla',50",
        "'Howard Roberts',41",
        "'Ricardo Broughton',13",
        "'Roland Speer',83",
        "'Cathy Cohan',58",
        "'Kathie Dawson',100",
        "'Gregg Mcquistion',11",
    ]
    run_query(instance, "CREATE TABLE {} (name String, id UInt32) ENGINE = S3('http://{}:{}/{}/{}', 'CSV')".format(
        name, started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename))

    run_query(instance, "INSERT INTO {} VALUES ({})".format(name, "),(".join(data)))

    assert run_query(instance, "SELECT sum(id) FROM {}".format(name)).splitlines() == ["753"]

    uncompressed_content = get_s3_file_content(started_cluster, bucket, filename)
    assert sum([int(i.split(',')[1]) for i in uncompressed_content.splitlines()]) == 753


@pytest.mark.parametrize("extension,method", [
    pytest.param("bin", "gzip", id="bin"),
    pytest.param("gz", "auto", id="gz")
])
def test_storage_s3_put_gzip(started_cluster, extension, method):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    filename = f"test_put_gzip.{extension}"
    name = f"test_put_gzip_{extension}"
    data = [
        "'Joseph Tomlinson',5",
        "'Earnest Essary',44",
        "'Matha Pannell',24",
        "'Michael Shavers',46",
        "'Elias Groce',38",
        "'Pamela Bramlet',50",
        "'Lewis Harrell',49",
        "'Tamara Fyall',58",
        "'George Dixon',38",
        "'Alice Walls',49",
        "'Paula Mais',24",
        "'Myrtle Pelt',93",
        "'Sylvia Naffziger',18",
        "'Amanda Cave',83",
        "'Yolanda Joseph',89"
    ]
    run_query(instance, f"""CREATE TABLE {name} (name String, id UInt32) ENGINE = S3(
                                'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{filename}',
                                'CSV',
                                '{method}')""")

    run_query(instance, f"INSERT INTO {name} VALUES ({'),('.join(data)})")

    assert run_query(instance, f"SELECT sum(id) FROM {name}").splitlines() == ["708"]

    # Read the object back directly and make sure it really is valid gzip.
    buf = io.BytesIO(get_s3_file_content(started_cluster, bucket, filename, decode=False))
    f = gzip.GzipFile(fileobj=buf, mode="rb")
    uncompressed_content = f.read().decode()
    assert sum([int(i.split(',')[1]) for i in uncompressed_content.splitlines()]) == 708