ClickHouse/tests/integration/test_storage_s3/test.py

347 lines
14 KiB
Python
Raw Normal View History

2019-09-19 09:34:33 +00:00
import json
import logging
import os
2020-01-27 21:44:18 +00:00
import random
2020-05-25 09:15:11 +00:00
import threading
2019-09-19 09:34:33 +00:00
import helpers.client
2019-06-26 00:41:14 +00:00
import pytest
from helpers.cluster import ClickHouseCluster, ClickHouseInstance
2019-09-19 09:34:33 +00:00
logging.getLogger().setLevel(logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())
# Creates S3 bucket for tests and allows anonymous read-write access to it.
def prepare_s3_bucket(cluster):
# Allows read-write access for bucket without authorization.
bucket_read_write_policy = {"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Principal": {"AWS": "*"},
"Action": "s3:GetBucketLocation",
"Resource": "arn:aws:s3:::root"
},
{
"Sid": "",
"Effect": "Allow",
"Principal": {"AWS": "*"},
"Action": "s3:ListBucket",
"Resource": "arn:aws:s3:::root"
},
{
"Sid": "",
"Effect": "Allow",
"Principal": {"AWS": "*"},
"Action": "s3:GetObject",
"Resource": "arn:aws:s3:::root/*"
},
{
"Sid": "",
"Effect": "Allow",
"Principal": {"AWS": "*"},
"Action": "s3:PutObject",
"Resource": "arn:aws:s3:::root/*"
}
]}
minio_client = cluster.minio_client
minio_client.set_bucket_policy(cluster.minio_bucket, json.dumps(bucket_read_write_policy))
cluster.minio_restricted_bucket = "{}-with-auth".format(cluster.minio_bucket)
if minio_client.bucket_exists(cluster.minio_restricted_bucket):
minio_client.remove_bucket(cluster.minio_restricted_bucket)
minio_client.make_bucket(cluster.minio_restricted_bucket)
# Returns content of given S3 file as string.
def get_s3_file_content(cluster, bucket, filename):
# type: (ClickHouseCluster, str) -> str
data = cluster.minio_client.get_object(bucket, filename)
data_str = ""
for chunk in data.stream():
data_str += chunk
return data_str
2019-09-19 09:34:33 +00:00
2019-06-26 00:41:14 +00:00
@pytest.fixture(scope="module")
def cluster():
2019-06-26 00:41:14 +00:00
try:
cluster = ClickHouseCluster(__file__)
cluster.add_instance("restricted_dummy", main_configs=["configs/config_for_test_remote_host_filter.xml"],
with_minio=True)
cluster.add_instance("dummy", with_minio=True, main_configs=["configs/defaultS3.xml"])
logging.info("Starting cluster...")
2019-06-26 00:41:14 +00:00
cluster.start()
logging.info("Cluster started")
2019-09-19 09:34:33 +00:00
prepare_s3_bucket(cluster)
logging.info("S3 bucket created")
run_s3_mock(cluster)
2019-09-19 09:34:33 +00:00
2019-06-26 00:41:14 +00:00
yield cluster
finally:
cluster.shutdown()
2019-09-24 10:58:42 +00:00
def run_query(instance, query, stdin=None, settings=None):
# type: (ClickHouseInstance, str, object, dict) -> str
2019-09-22 10:42:47 +00:00
logging.info("Running query '{}'...".format(query))
2019-09-24 10:58:42 +00:00
result = instance.query(query, stdin=stdin, settings=settings)
2019-09-22 10:42:47 +00:00
logging.info("Query finished")
2019-06-26 00:41:14 +00:00
return result
2019-09-22 10:42:47 +00:00
# Test simple put.
@pytest.mark.parametrize("maybe_auth,positive", [
2019-12-03 16:23:24 +00:00
("", True),
("'minio','minio123',", True),
("'wrongid','wrongkey',", False)
])
def test_put(cluster, maybe_auth, positive):
# type: (ClickHouseCluster) -> None
2019-09-19 09:34:33 +00:00
bucket = cluster.minio_bucket if not maybe_auth else cluster.minio_restricted_bucket
instance = cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
2019-09-22 10:42:47 +00:00
values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
values_csv = "1,2,3\n3,2,1\n78,43,45\n"
filename = "test.csv"
put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') values {}".format(
cluster.minio_host, cluster.minio_port, bucket, filename, maybe_auth, table_format, values)
try:
run_query(instance, put_query)
except helpers.client.QueryRuntimeException:
2019-12-03 16:23:24 +00:00
if positive:
raise
else:
assert positive
assert values_csv == get_s3_file_content(cluster, bucket, filename)
# Test put values in CSV format.
@pytest.mark.parametrize("maybe_auth,positive", [
2019-12-03 16:23:24 +00:00
("", True),
("'minio','minio123',", True),
("'wrongid','wrongkey',", False)
])
def test_put_csv(cluster, maybe_auth, positive):
# type: (ClickHouseCluster) -> None
bucket = cluster.minio_bucket if not maybe_auth else cluster.minio_restricted_bucket
instance = cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
filename = "test.csv"
put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format(
cluster.minio_host, cluster.minio_port, bucket, filename, maybe_auth, table_format)
2019-09-22 10:42:47 +00:00
csv_data = "8,9,16\n11,18,13\n22,14,2\n"
try:
run_query(instance, put_query, stdin=csv_data)
except helpers.client.QueryRuntimeException:
2019-12-03 16:23:24 +00:00
if positive:
raise
else:
assert positive
assert csv_data == get_s3_file_content(cluster, bucket, filename)
# Test put and get with S3 server redirect.
def test_put_get_with_redirect(cluster):
# type: (ClickHouseCluster) -> None
bucket = cluster.minio_bucket
instance = cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
values_csv = "1,1,1\n1,1,1\n11,11,11\n"
filename = "test.csv"
query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
cluster.minio_redirect_host, cluster.minio_redirect_port, bucket, filename, table_format, values)
2019-09-19 09:34:33 +00:00
run_query(instance, query)
assert values_csv == get_s3_file_content(cluster, bucket, filename)
query = "select *, column1*column2*column3 from s3('http://{}:{}/{}/{}', 'CSV', '{}')".format(
cluster.minio_redirect_host, cluster.minio_redirect_port, bucket, filename, table_format)
2019-09-19 09:34:33 +00:00
stdout = run_query(instance, query)
2019-09-19 09:34:33 +00:00
assert list(map(str.split, stdout.splitlines())) == [
2019-09-22 10:42:47 +00:00
["1", "1", "1", "1"],
["1", "1", "1", "1"],
["11", "11", "11", "1331"],
2019-06-26 00:41:14 +00:00
]
2020-01-27 21:44:18 +00:00
def test_put_get_with_globs(cluster):
# type: (ClickHouseCluster) -> None
bucket = cluster.minio_bucket
instance = cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
max_path = ""
2020-01-27 21:44:18 +00:00
for i in range(10):
for j in range(10):
path = "{}_{}/{}.csv".format(i, random.choice(['a', 'b', 'c', 'd']), j)
max_path = max(path, max_path)
values = "({},{},{})".format(i, j, i + j)
2020-01-27 21:44:18 +00:00
query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
cluster.minio_host, cluster.minio_port, bucket, path, table_format, values)
run_query(instance, query)
query = "select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from s3('http://{}:{}/{}/*_{{a,b,c,d}}/%3f.csv', 'CSV', '{}')".format(
2020-01-27 21:44:18 +00:00
cluster.minio_redirect_host, cluster.minio_redirect_port, bucket, table_format)
assert run_query(instance, query).splitlines() == [
"450\t450\t900\t0.csv\t{bucket}/{max_path}".format(bucket=bucket, max_path=max_path)]
2020-01-27 21:44:18 +00:00
# Test multipart put.
@pytest.mark.parametrize("maybe_auth,positive", [
2019-12-03 16:23:24 +00:00
("", True),
# ("'minio','minio123',",True), Redirect with credentials not working with nginx.
("'wrongid','wrongkey',", False)
])
def test_multipart_put(cluster, maybe_auth, positive):
# type: (ClickHouseCluster) -> None
bucket = cluster.minio_bucket if not maybe_auth else cluster.minio_restricted_bucket
instance = cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
# Minimum size of part is 5 Mb for Minio.
# See: https://github.com/minio/minio/blob/master/docs/minio-limits.md
min_part_size_bytes = 5 * 1024 * 1024
csv_size_bytes = int(min_part_size_bytes * 1.5) # To have 2 parts.
one_line_length = 6 # 3 digits, 2 commas, 1 line separator.
# Generate data having size more than one part
int_data = [[1, 2, 3] for i in range(csv_size_bytes / one_line_length)]
csv_data = "".join(["{},{},{}\n".format(x, y, z) for x, y, z in int_data])
assert len(csv_data) > min_part_size_bytes
filename = "test_multipart.csv"
put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format(
cluster.minio_redirect_host, cluster.minio_redirect_port, bucket, filename, maybe_auth, table_format)
try:
run_query(instance, put_query, stdin=csv_data, settings={'s3_min_upload_part_size': min_part_size_bytes})
except helpers.client.QueryRuntimeException:
2019-12-03 16:23:24 +00:00
if positive:
raise
else:
assert positive
2020-07-10 19:42:18 +00:00
# Use proxy access logs to count number of parts uploaded to Minio.
proxy_logs = cluster.get_container_logs("proxy1") # type: str
assert proxy_logs.count("PUT /{}/{}".format(bucket, filename)) >= 2
assert csv_data == get_s3_file_content(cluster, bucket, filename)
2019-11-06 17:06:50 +00:00
2019-12-03 17:36:02 +00:00
def test_remote_host_filter(cluster):
instance = cluster.instances["restricted_dummy"]
2019-11-06 17:06:50 +00:00
format = "column1 UInt32, column2 UInt32, column3 UInt32"
query = "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(
"invalid_host", cluster.minio_port, cluster.minio_bucket, format)
2019-11-06 17:06:50 +00:00
assert "not allowed in config.xml" in instance.query_and_get_error(query)
other_values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(
"invalid_host", cluster.minio_port, cluster.minio_bucket, format, other_values)
2019-11-06 17:06:50 +00:00
assert "not allowed in config.xml" in instance.query_and_get_error(query)
@pytest.mark.parametrize("s3_storage_args", [
"''", # 1 arguments
"'','','','','',''" # 6 arguments
])
def test_wrong_s3_syntax(cluster, s3_storage_args):
instance = cluster.instances["dummy"] # type: ClickHouseInstance
expected_err_msg = "Code: 42" # NUMBER_OF_ARGUMENTS_DOESNT_MATCH
query = "create table test_table_s3_syntax (id UInt32) ENGINE = S3({})".format(s3_storage_args)
assert expected_err_msg in instance.query_and_get_error(query)
2020-05-25 09:15:11 +00:00
2020-05-25 21:05:15 +00:00
# https://en.wikipedia.org/wiki/One_Thousand_and_One_Nights
2020-05-25 09:15:11 +00:00
def test_s3_glob_scheherazade(cluster):
bucket = cluster.minio_bucket
instance = cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
max_path = ""
values = "(1, 1, 1)"
nights_per_job = 1001 // 30
jobs = []
for night in range(0, 1001, nights_per_job):
def add_tales(start, end):
for i in range(start, end):
path = "night_{}/tale.csv".format(i)
query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
cluster.minio_host, cluster.minio_port, bucket, path, table_format, values)
run_query(instance, query)
jobs.append(threading.Thread(target=add_tales, args=(night, min(night + nights_per_job, 1001))))
2020-05-25 09:15:11 +00:00
jobs[-1].start()
for job in jobs:
job.join()
query = "select count(), sum(column1), sum(column2), sum(column3) from s3('http://{}:{}/{}/night_*/tale.csv', 'CSV', '{}')".format(
cluster.minio_redirect_host, cluster.minio_redirect_port, bucket, table_format)
assert run_query(instance, query).splitlines() == ["1001\t1001\t1001\t1001"]
def run_s3_mock(cluster):
logging.info("Starting s3 mock")
container_id = cluster.get_container_id('resolver')
current_dir = os.path.dirname(__file__)
cluster.copy_file_to_container(container_id, os.path.join(current_dir, "s3_mock", "mock_s3.py"), "mock_s3.py")
cluster.exec_in_container(container_id, ["python", "mock_s3.py"], detach=True)
logging.info("S3 mock started")
2020-07-10 19:42:18 +00:00
def test_custom_auth_headers(cluster):
ping_response = cluster.exec_in_container(cluster.get_container_id('resolver'),
["curl", "-s", "http://resolver:8080"])
assert ping_response == 'OK', 'Expected "OK", but got "{}"'.format(ping_response)
table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
filename = "test.csv"
get_query = "select * from s3('http://resolver:8080/{bucket}/{file}', 'CSV', '{table_format}')".format(
bucket=cluster.minio_restricted_bucket,
file=filename,
table_format=table_format)
instance = cluster.instances["dummy"] # type: ClickHouseInstance
result = run_query(instance, get_query)
assert result == '1\t2\t3\n'
def test_infinite_redirect(cluster):
bucket = "redirected"
table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
filename = "test.csv"
get_query = "select * from s3('http://resolver:8080/{bucket}/{file}', 'CSV', '{table_format}')".format(
2020-07-10 19:42:18 +00:00
bucket=bucket,
file=filename,
table_format=table_format)
instance = cluster.instances["dummy"] # type: ClickHouseInstance
2020-07-10 19:42:18 +00:00
exception_raised = False
try:
2020-07-10 19:42:18 +00:00
run_query(instance, get_query)
except Exception as e:
assert str(e).find("Too many redirects while trying to access") != -1
2020-07-10 19:42:18 +00:00
exception_raised = True
finally:
assert exception_raised