ClickHouse/tests/integration/test_storage_s3/test.py

import gzip
import json
import logging
import os
import io
import random
import threading
import time

import helpers.client
import pytest
from helpers.cluster import ClickHouseCluster, ClickHouseInstance, get_instances_dir
from helpers.network import PartitionManager
from helpers.test_tools import exec_query_with_retry

# Port used together with `minio_ip` when tests build direct (non-redirect) S3 URLs.
MINIO_INTERNAL_PORT = 9001

# Directory containing this test file (symlinks resolved).
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

# Path of the "dummy" instance's defaultS3.xml inside the instances directory;
# replace_config() rewrites this file in place.
CONFIG_PATH = os.path.join(SCRIPT_DIR, './{}/dummy/configs/config.d/defaultS3.xml'.format(get_instances_dir()))


# Prepares the S3 buckets used by the tests: opens the main bucket for anonymous
# read-write access and (re)creates an empty "<bucket>-with-auth" bucket, which
# gets no anonymous-access policy here.
def prepare_s3_bucket(started_cluster):
    # One "Allow" statement per (action, resource) pair, granted to every principal.
    anonymous_grants = (
        ("s3:GetBucketLocation", "arn:aws:s3:::root"),
        ("s3:ListBucket", "arn:aws:s3:::root"),
        ("s3:GetObject", "arn:aws:s3:::root/*"),
        ("s3:PutObject", "arn:aws:s3:::root/*"),
    )
    policy_document = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Sid": "",
                "Effect": "Allow",
                "Principal": {"AWS": "*"},
                "Action": action,
                "Resource": resource,
            }
            for action, resource in anonymous_grants
        ],
    }

    client = started_cluster.minio_client
    client.set_bucket_policy(started_cluster.minio_bucket, json.dumps(policy_document))

    # Tests reach the restricted bucket through this attribute on the cluster object.
    started_cluster.minio_restricted_bucket = "{}-with-auth".format(started_cluster.minio_bucket)
    restricted = started_cluster.minio_restricted_bucket

    # Drop a bucket left over from a previous run before creating it again.
    if client.bucket_exists(restricted):
        client.remove_bucket(restricted)

    client.make_bucket(restricted)


# Uploads the bytes in `data` to `bucket` under the key `filename` via the cluster's MinIO client.
def put_s3_file_content(started_cluster, bucket, filename, data):
    payload = io.BytesIO(data)
    payload_size = len(data)
    client = started_cluster.minio_client
    client.put_object(bucket, filename, payload, payload_size)


# Returns the content of the given S3 file: decoded text (str) by default,
# or the raw bytes when decode=False.
def get_s3_file_content(started_cluster, bucket, filename, decode=True):
    response = started_cluster.minio_client.get_object(bucket, filename)
    # Join the streamed chunks in one pass instead of growing a bytes object
    # chunk by chunk, which re-copies the accumulated data every time.
    raw = b"".join(response.stream())
    if decode:
        return raw.decode()
    return raw


# Module-scoped fixture: starts a cluster with three ClickHouse instances backed by MinIO,
# prepares the buckets and the S3 mock servers, and shuts the cluster down afterwards.
@pytest.fixture(scope="module")
def started_cluster():
    # Construct the cluster outside of `try`: if the constructor itself raises,
    # `finally` must not touch an unbound `cluster` and hide the real error.
    cluster = ClickHouseCluster(__file__)
    try:
        cluster.add_instance("restricted_dummy", main_configs=["configs/config_for_test_remote_host_filter.xml"],
                             with_minio=True)
        cluster.add_instance("dummy", with_minio=True, main_configs=["configs/defaultS3.xml", "configs/named_collections.xml"])
        cluster.add_instance("s3_max_redirects", with_minio=True, main_configs=["configs/defaultS3.xml"],
                             user_configs=["configs/s3_max_redirects.xml"])
        logging.info("Starting cluster...")
        cluster.start()
        logging.info("Cluster started")

        prepare_s3_bucket(cluster)
        logging.info("S3 bucket created")
        run_s3_mocks(cluster)

        yield cluster
    finally:
        # Runs after the last test of the module, and also when setup above fails midway.
        cluster.shutdown()


# Executes `query` on `instance` (forwarding `stdin` and `settings`), logging before and after,
# and returns whatever the instance returned.
def run_query(instance, query, stdin=None, settings=None):
    # type: (ClickHouseInstance, str, object, dict) -> str

    logging.info(f"Running query '{query}'...")
    output = instance.query(query, stdin=stdin, settings=settings)
    logging.info("Query finished")
    return output


# Test simple put. Also checks that wrong credentials produce an error with every compression method.
@pytest.mark.parametrize(
    "maybe_auth,positive,compression",
    [
        pytest.param("", True, 'auto', id="positive"),
        pytest.param("'minio','minio123',", True, 'auto', id="auth_positive"),
    ] + [
        # Negative cases: the same wrong credentials, once per compression method.
        pytest.param("'wrongid','wrongkey',", False, method, id=method)
        for method in ('auto', 'gzip', 'deflate', 'brotli', 'xz', 'zstd')
    ],
)
def test_put(started_cluster, maybe_auth, positive, compression):
    # type: (ClickHouseCluster, str, bool, str) -> None

    # Requests without credentials use the anonymous bucket, all others the restricted one.
    if maybe_auth:
        bucket = started_cluster.minio_restricted_bucket
    else:
        bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    filename = "test.csv"
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
    values_csv = "1,2,3\n3,2,1\n78,43,45\n"
    put_query = f"""insert into table function s3('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/{filename}',
                    {maybe_auth}'CSV', '{table_format}', {compression}) values {values}"""

    try:
        run_query(instance, put_query)
    except helpers.client.QueryRuntimeException:
        # A failure is only acceptable for the negative cases.
        if positive:
            raise
        return

    assert positive
    assert values_csv == get_s3_file_content(started_cluster, bucket, filename)


# Checks that PARTITION BY writes one object per value of column3, both through the
# s3 table function and through a table with the S3 engine.
def test_partition_by(started_cluster):
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    bucket = started_cluster.minio_bucket
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
    partition_by = "column3"
    # (value of column3 substituted for {_partition_id}, expected object content)
    rows_by_partition = (("3", "1,2,3\n"), ("1", "3,2,1\n"), ("45", "78,43,45\n"))

    def check_objects(prefix):
        for partition_id, content in rows_by_partition:
            assert content == get_s3_file_content(started_cluster, bucket, f"{prefix}_{partition_id}.csv")

    filename = "test_{_partition_id}.csv"
    put_query = f"""INSERT INTO TABLE FUNCTION
        s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV', '{table_format}')
        PARTITION BY {partition_by} VALUES {values}"""
    run_query(instance, put_query)
    check_objects("test")

    filename = "test2_{_partition_id}.csv"
    create_query = (
        f"create table p ({table_format}) "
        f"engine=S3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV') "
        f"partition by column3"
    )
    instance.query(create_query)
    instance.query(f"insert into p values {values}")
    check_objects("test2")


# Checks PARTITION BY on a String column whose values contain a slash and non-ASCII text:
# the value is substituted verbatim into the object key.
def test_partition_by_string_column(started_cluster):
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    bucket = started_cluster.minio_bucket
    table_format = "col_num UInt32, col_str String"
    partition_by = "col_str"
    values = "(1, 'foo/bar'), (3, 'йцук'), (78, '你好')"
    filename = "test_{_partition_id}.csv"
    put_query = f"""INSERT INTO TABLE FUNCTION
        s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV', '{table_format}')
        PARTITION BY {partition_by} VALUES {values}"""

    run_query(instance, put_query)

    # (object key, expected CSV content)
    expected_objects = [
        ("test_foo/bar.csv", '1,"foo/bar"\n'),
        ("test_йцук.csv", '3,"йцук"\n'),
        ("test_你好.csv", '78,"你好"\n'),
    ]
    for key, content in expected_objects:
        assert content == get_s3_file_content(started_cluster, bucket, key)


# Checks PARTITION BY with a constant expression: every row ends up in the single object test_88.csv.
def test_partition_by_const_column(started_cluster):
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    bucket = started_cluster.minio_bucket
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    partition_by = "'88'"
    values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
    filename = "test_{_partition_id}.csv"
    put_query = f"""INSERT INTO TABLE FUNCTION
        s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV', '{table_format}')
        PARTITION BY {partition_by} VALUES {values}"""

    run_query(instance, put_query)

    stored = get_s3_file_content(started_cluster, bucket, "test_88.csv")
    assert "1,2,3\n3,2,1\n78,43,45\n" == stored


# Uploads an object whose key contains a space or a plus sign and reads it back through
# an exact URL-encoded path and through two glob patterns.
@pytest.mark.parametrize("special", ["space", "plus"])
def test_get_file_with_special(started_cluster, special):
    # (character used in the object key, its percent-encoded form used in the URL)
    symbol, urlsafe_symbol = {"space": (" ", "%20"), "plus": ("+", "%2B")}[special]
    auth = "'minio','minio123',"
    bucket = started_cluster.minio_restricted_bucket
    instance = started_cluster.instances["dummy"]
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = [[12549, 2463, 19893], [64021, 38652, 66703], [81611, 39650, 83516], [11079, 59507, 61546], [51764, 69952, 6876], [41165, 90293, 29095], [40167, 78432, 48309], [81629, 81327, 11855], [55852, 21643, 98507], [6738, 54643, 41155]]
    values_csv = "".join(",".join(str(cell) for cell in row) + "\n" for row in values).encode()
    filename = f"get_file_with_{special}_{symbol}two.csv"
    put_s3_file_content(started_cluster, bucket, filename, values_csv)

    url_paths = (
        f"get_file_with_{special}_{urlsafe_symbol}two.csv",
        f"get_file_with_{special}*.csv",
        f"get_file_with_{special}_{urlsafe_symbol}*.csv",
    )
    for url_path in url_paths:
        get_query = f"SELECT * FROM s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_path}', {auth}'CSV', '{table_format}') FORMAT TSV"
        rows = [[int(field) for field in line.split()] for line in run_query(instance, get_query).splitlines()]
        assert rows == values


# Requests a path containing a special symbol from the service on resolver:8082 and checks
# which path comes back as the single row of the result.
@pytest.mark.parametrize("special", ["space", "plus", "plus2"])
def test_get_path_with_special(started_cluster, special):
    # (symbol as written in the query URL, symbol expected in the returned path)
    safe_symbol, symbol = {
        "space": ("%20", "%20"),
        "plus": ("+", "%2B"),
        "plus2": ("%2B", "%2B"),
    }[special]
    instance = started_cluster.instances["dummy"]
    auth = "'minio','minio123',"
    table_format = "column1 String"
    get_query = f"SELECT * FROM s3('http://resolver:8082/get-my-path/{safe_symbol}.csv', {auth}'CSV', '{table_format}') FORMAT TSV"
    returned_lines = run_query(instance, get_query).splitlines()
    assert returned_lines == [f"/{symbol}.csv"]


# Test put no data to S3: inserting zero rows must not create the object,
# so reading it back afterwards has to fail with a "key does not exist" error.
@pytest.mark.parametrize("auth", [
    pytest.param("'minio','minio123',", id="minio")
])
def test_empty_put(started_cluster, auth):
    # type: (ClickHouseCluster, str) -> None

    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"

    drop_empty_table_query = "DROP TABLE IF EXISTS empty_table"
    create_empty_table_query = """
        CREATE TABLE empty_table (
        {}
        ) ENGINE = Null()
    """.format(table_format)

    run_query(instance, drop_empty_table_query)
    run_query(instance, create_empty_table_query)

    filename = "empty_put_test.csv"
    put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') select * from empty_table".format(
        started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename, auth, table_format)

    run_query(instance, put_query)

    get_query = "select count(*) from s3('http://{}:{}/{}/{}', {}'CSV', '{}')".format(
        started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename, auth, table_format)
    with pytest.raises(helpers.client.QueryRuntimeException) as excinfo:
        run_query(instance, get_query)

    # The previous check compared str.find() with 0, which holds for practically any
    # message (find returns -1 on a miss); test for the substring itself instead.
    assert "The specified key does not exist" in str(excinfo.value)


# Test put values in CSV format, feeding the rows through the query's stdin.
@pytest.mark.parametrize("maybe_auth,positive", [
    pytest.param("", True, id="positive"),
    pytest.param("'minio','minio123',", True, id="auth_positive"),
    pytest.param("'wrongid','wrongkey',", False, id="negative"),
])
def test_put_csv(started_cluster, maybe_auth, positive):
    # type: (ClickHouseCluster, str, bool) -> None

    # Requests without credentials use the anonymous bucket, all others the restricted one.
    if maybe_auth:
        bucket = started_cluster.minio_restricted_bucket
    else:
        bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    csv_data = "8,9,16\n11,18,13\n22,14,2\n"
    put_query = (
        f"insert into table function s3('http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{filename}', "
        f"{maybe_auth}'CSV', '{table_format}') format CSV"
    )

    try:
        run_query(instance, put_query, stdin=csv_data)
    except helpers.client.QueryRuntimeException:
        # A failure is only acceptable for the negative case.
        if positive:
            raise
        return

    assert positive
    assert csv_data == get_s3_file_content(started_cluster, bucket, filename)


# Test put and get with S3 server redirect: both queries address the redirect host/port.
def test_put_get_with_redirect(started_cluster):
    # type: (ClickHouseCluster) -> None

    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    bucket = started_cluster.minio_bucket
    filename = "test.csv"
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
    values_csv = "1,1,1\n1,1,1\n11,11,11\n"

    insert_query = f"insert into table function s3('http://{started_cluster.minio_redirect_host}:{started_cluster.minio_redirect_port}/{bucket}/{filename}', 'CSV', '{table_format}') values {values}"
    run_query(instance, insert_query)

    assert values_csv == get_s3_file_content(started_cluster, bucket, filename)

    select_query = f"select *, column1*column2*column3 from s3('http://{started_cluster.minio_redirect_host}:{started_cluster.minio_redirect_port}/{bucket}/{filename}', 'CSV', '{table_format}')"
    stdout = run_query(instance, select_query)

    # Last column is the product of the first three.
    rows = [line.split() for line in stdout.splitlines()]
    assert rows == [
        ["1", "1", "1", "1"],
        ["1", "1", "1", "1"],
        ["11", "11", "11", "1331"],
    ]


# Test put with restricted S3 server redirect: on the "s3_max_redirects" instance a direct
# insert must work, while an insert through the redirecting endpoint must be rejected.
def test_put_with_zero_redirect(started_cluster):
    # type: (ClickHouseCluster) -> None

    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["s3_max_redirects"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
    filename = "test.csv"

    # Should work without redirect
    query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
        started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename, table_format, values)
    run_query(instance, query)

    # Should not work with redirect
    query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, filename, table_format, values)
    # pytest.raises fails the test when nothing is raised and keeps the message check
    # outside the try machinery, so a wrong message is reported as such instead of being
    # masked by a second assertion in a `finally` block.
    with pytest.raises(helpers.client.QueryRuntimeException) as excinfo:
        run_query(instance, query)
    assert "Too many redirects while trying to access" in str(excinfo.value)


# Writes 100 single-row objects under a random prefix and reads them back with one glob
# query through the redirect endpoint, then removes the objects again.
def test_put_get_with_globs(started_cluster):
    # type: (ClickHouseCluster) -> None
    unique_prefix = random.randint(1, 10000)
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"

    written_paths = []
    for i in range(10):
        for j in range(10):
            suffix = random.choice(['a', 'b', 'c', 'd'])
            path = f"{unique_prefix}/{i}_{suffix}/{j}.csv"
            written_paths.append(path)
            values = f"({i},{j},{i + j})"
            insert_query = f"insert into table function s3('http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{path}', 'CSV', '{table_format}') values {values}"
            run_query(instance, insert_query)
    # Lexicographically greatest key; the query below must report it as max(_path).
    max_path = max(written_paths)

    select_query = f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from s3('http://{started_cluster.minio_redirect_host}:{started_cluster.minio_redirect_port}/{bucket}/{unique_prefix}/*_{{a,b,c,d}}/%3f.csv', 'CSV', '{table_format}')"
    expected_row = f"450\t450\t900\t0.csv\t{bucket}/{max_path}"
    assert run_query(instance, select_query).splitlines() == [expected_row]

    # Clean up everything written under the random prefix; the listing is materialized
    # before any object is removed.
    minio = started_cluster.minio_client
    leftovers = list(minio.list_objects(started_cluster.minio_bucket, prefix=f'{unique_prefix}/', recursive=True))
    for obj in leftovers:
        minio.remove_object(started_cluster.minio_bucket, obj.object_name)


# Test multipart put: uploads more than one part's worth of CSV through the redirect
# endpoint and checks, via the proxy access log, that at least two PUTs were issued.
@pytest.mark.parametrize("maybe_auth,positive", [
    pytest.param("", True, id="positive"),
    # The trailing comma is required: `maybe_auth` is spliced directly in front of 'CSV',
    # and without it the query is malformed instead of carrying wrong credentials.
    pytest.param("'wrongid','wrongkey',", False, id="negative"),
    # ("'minio','minio123',",True), Redirect with credentials not working with nginx.
])
def test_multipart_put(started_cluster, maybe_auth, positive):
    # type: (ClickHouseCluster, str, bool) -> None

    bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"

    # Minimum size of part is 5 Mb for Minio.
    # See: https://github.com/minio/minio/blob/master/docs/minio-limits.md
    min_part_size_bytes = 5 * 1024 * 1024
    csv_size_bytes = int(min_part_size_bytes * 1.5)  # To have 2 parts.

    one_line = "1,2,3\n"  # 3 digits, 2 commas, 1 line separator.

    # Generate data having size more than one part
    csv_data = one_line * (csv_size_bytes // len(one_line))

    assert len(csv_data) > min_part_size_bytes

    filename = "test_multipart.csv"
    put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format(
        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, filename, maybe_auth, table_format)

    try:
        # Disable single-part uploads so that the data has to go through multipart upload.
        run_query(instance, put_query, stdin=csv_data, settings={'s3_min_upload_part_size': min_part_size_bytes,
                                                                 's3_max_single_part_upload_size': 0})
    except helpers.client.QueryRuntimeException:
        if positive:
            raise
    else:
        assert positive

        # Use proxy access logs to count number of parts uploaded to Minio.
        proxy_logs = started_cluster.get_container_logs("proxy1")  # type: str
        assert proxy_logs.count("PUT /{}/{}".format(bucket, filename)) >= 2

        assert csv_data == get_s3_file_content(started_cluster, bucket, filename)


# On the "restricted_dummy" instance both reading from and writing to a host named
# "invalid_host" must be refused with a "not allowed in configuration file" error.
def test_remote_host_filter(started_cluster):
    instance = started_cluster.instances["restricted_dummy"]
    structure = "column1 UInt32, column2 UInt32, column3 UInt32"
    bucket = started_cluster.minio_bucket

    select_query = f"select *, column1*column2*column3 from s3('http://invalid_host:{MINIO_INTERNAL_PORT}/{bucket}/test.csv', 'CSV', '{structure}')"
    select_error = instance.query_and_get_error(select_query)
    assert "not allowed in configuration file" in select_error

    other_values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
    insert_query = f"insert into table function s3('http://invalid_host:{MINIO_INTERNAL_PORT}/{bucket}/test.csv', 'CSV', '{structure}') values {other_values}"
    insert_error = instance.query_and_get_error(insert_query)
    assert "not allowed in configuration file" in insert_error


# Creating an S3 table with one or with six engine arguments must be rejected with error code 42.
@pytest.mark.parametrize("s3_storage_args", [
    pytest.param("''", id="1_argument"),
    pytest.param("'','','','','',''", id="6_arguments"),
])
def test_wrong_s3_syntax(started_cluster, s3_storage_args):
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance

    create_query = f"create table test_table_s3_syntax (id UInt32) ENGINE = S3({s3_storage_args})"
    error_text = instance.query_and_get_error(create_query)

    # Code 42 is NUMBER_OF_ARGUMENTS_DOESNT_MATCH.
    assert "Code: 42" in error_text


# https://en.wikipedia.org/wiki/One_Thousand_and_One_Nights
# Writes 1001 single-row objects ("night_<i>/tale.csv") from parallel threads and reads
# them all back with one glob query through the redirect endpoint.
def test_s3_glob_scheherazade(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 1, 1)"
    total_nights = 1001
    nights_per_job = total_nights // 30

    def add_tales(start, end):
        # Insert one object per night in the half-open range [start, end).
        for night_no in range(start, end):
            path = f"night_{night_no}/tale.csv"
            insert_query = f"insert into table function s3('http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{path}', 'CSV', '{table_format}') values {values}"
            run_query(instance, insert_query)

    jobs = []
    for first_night in range(0, total_nights, nights_per_job):
        # The last chunk is shorter, hence the clamp to total_nights.
        last_night = min(first_night + nights_per_job, total_nights)
        worker = threading.Thread(target=add_tales, args=(first_night, last_night))
        jobs.append(worker)
        worker.start()

    for worker in jobs:
        worker.join()

    select_query = f"select count(), sum(column1), sum(column2), sum(column3) from s3('http://{started_cluster.minio_redirect_host}:{started_cluster.minio_redirect_port}/{bucket}/night_*/tale.csv', 'CSV', '{table_format}')"
    assert run_query(instance, select_query).splitlines() == ["1001\t1001\t1001\t1001"]


# Copies the S3 mock scripts into their containers, launches them detached and then
# blocks until each one answers "OK" over HTTP (up to 100 attempts, one second apart).
def run_s3_mocks(started_cluster):
    logging.info("Starting s3 mocks")
    # (script name inside ./s3_mocks, container to run it in, port passed to the script)
    mocks = (
        ("mock_s3.py", "resolver", "8080"),
        ("unstable_server.py", "resolver", "8081"),
        ("echo.py", "resolver", "8082"),
    )
    for script, container, port in mocks:
        container_id = started_cluster.get_container_id(container)
        script_path = os.path.join(os.path.dirname(__file__), "s3_mocks", script)
        started_cluster.copy_file_to_container(container_id, script_path, script)
        launch_command = ["python", script, port]
        started_cluster.exec_in_container(container_id, launch_command, detach=True)

    # Wait for S3 mocks to start
    num_attempts = 100
    for script, container, port in mocks:
        for attempt in range(num_attempts):
            ping_response = started_cluster.exec_in_container(
                started_cluster.get_container_id(container),
                ["curl", "-s", f"http://localhost:{port}/"],
                nothrow=True)
            if ping_response == 'OK':
                logging.debug(f"mock {script} ({port}) answered {ping_response} on attempt {attempt}")
                break
            if attempt < num_attempts - 1:
                # Not ready yet and attempts remain: wait a bit and retry.
                time.sleep(1)
                continue
            # Last attempt and still no "OK": give up.
            assert ping_response == 'OK', 'Expected "OK", but got "{}"'.format(ping_response)

    logging.info("S3 mocks started")


# Rewrites the file at CONFIG_PATH in place, replacing every occurrence of `old`
# with `new` line by line.
def replace_config(old, new):
    # Context managers make sure the file is closed even if reading or writing fails.
    with open(CONFIG_PATH, 'r') as config:
        config_lines = config.readlines()
    config_lines = [line.replace(old, new) for line in config_lines]
    with open(CONFIG_PATH, 'w') as config:
        config.writelines(config_lines)


# Reads from the mock on resolver:8080 without passing credentials in the query, then swaps
# the Authorization header in the instance config for an invalid one and checks that reads fail.
# NOTE(review): presumably the mock validates the header configured in defaultS3.xml —
# confirm against s3_mocks/mock_s3.py.
def test_custom_auth_headers(started_cluster):
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    get_query = "select * from s3('http://resolver:8080/{bucket}/{file}', 'CSV', '{table_format}')".format(
        bucket=started_cluster.minio_restricted_bucket,
        file=filename,
        table_format=table_format)

    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    result = run_query(instance, get_query)
    assert result == '1\t2\t3\n'

    instance.query("DROP TABLE IF EXISTS test")
    instance.query(
        "CREATE TABLE test ({table_format}) ENGINE = S3('http://resolver:8080/{bucket}/{file}', 'CSV')".format(
            bucket=started_cluster.minio_restricted_bucket,
            file=filename,
            table_format=table_format
        ))
    assert run_query(instance, "SELECT * FROM test") == '1\t2\t3\n'

    valid_header = "<header>Authorization: Bearer TOKEN"
    invalid_header = "<header>Authorization: Bearer INVALID_TOKEN"

    replace_config(valid_header, invalid_header)
    try:
        instance.query("SYSTEM RELOAD CONFIG")
        ret, err = instance.query_and_get_answer_with_error("SELECT * FROM test")
        assert ret == "" and err != ""
    finally:
        # Always put the valid token back: the cluster is shared by the whole module, and
        # a failure above would otherwise leave the config broken for later tests.
        replace_config(invalid_header, valid_header)
        instance.query("SYSTEM RELOAD CONFIG")

    assert run_query(instance, "SELECT * FROM test") == '1\t2\t3\n'
    instance.query("DROP TABLE test")


# Reading from the "restricteddirectory" path via resolver:8080 must fail with
# return code 243 and a 'Forbidden Error' message on stderr.
def test_custom_auth_headers_exclusion(started_cluster):
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    get_query = f"SELECT * FROM s3('http://resolver:8080/{started_cluster.minio_restricted_bucket}/restricteddirectory/{filename}', 'CSV', '{table_format}')"

    with pytest.raises(helpers.client.QueryRuntimeException) as ei:
        # Only reached past run_query when the query unexpectedly succeeds; the output
        # is printed to help debugging before pytest.raises fails the test.
        print(run_query(instance, get_query))

    failure = ei.value
    assert failure.returncode == 243
    assert 'Forbidden Error' in failure.stderr


# Reading from the "redirected" bucket on the redirect port must fail with a
# "Too many redirects" error.
def test_infinite_redirect(started_cluster):
    bucket = "redirected"
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    get_query = f"select * from s3('http://resolver:{started_cluster.minio_redirect_port}/{bucket}/{filename}', 'CSV', '{table_format}')"
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance

    # pytest.raises fails the test when nothing is raised; checking the message afterwards
    # keeps a wrong message from being masked by a second assertion in a `finally` block.
    with pytest.raises(helpers.client.QueryRuntimeException) as excinfo:
        run_query(instance, get_query)
    assert "Too many redirects while trying to access" in str(excinfo.value)
# Uploads a gzip-compressed CSV and reads it through an S3 table, once with the compression
# method given explicitly ("gzip" for a .bin key) and once with "auto" for a .gz key.
@pytest.mark.parametrize("extension,method", [
    pytest.param("bin", "gzip", id="bin"),
    pytest.param("gz", "auto", id="gz"),
])
def test_storage_s3_get_gzip(started_cluster, extension, method):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    filename = f"test_get_gzip.{extension}"
    name = f"test_get_gzip_{extension}"
    data = [
        "Sophia Intrieri,55",
        "Jack Taylor,71",
        "Christopher Silva,66",
        "Clifton Purser,35",
        "Richard Aceuedo,43",
        "Lisa Hensley,31",
        "Alice Wehrley,1",
        "Mary Farmer,47",
        "Samara Ramirez,19",
        "Shirley Lloyd,51",
        "Santos Cowger,0",
        "Richard Mundt,88",
        "Jerry Gonzalez,15",
        "Angela James,10",
        "Norman Ortega,33",
        ""  # Trailing empty item makes the joined text end with a newline.
    ]
    run_query(instance, f"DROP TABLE IF EXISTS {name}")

    buf = io.BytesIO()
    with gzip.GzipFile(fileobj=buf, mode="wb") as compressed:
        compressed.write(("\n".join(data)).encode())
    put_s3_file_content(started_cluster, bucket, filename, buf.getvalue())

    run_query(instance, f"""CREATE TABLE {name} (name String, id UInt32) ENGINE = S3(
                                'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{filename}',
                                'CSV',
                                '{method}')""")

    # 565 is the sum of the ids listed above. The comparison used to be a bare expression
    # whose result was thrown away; it has to be asserted to check anything.
    assert run_query(instance, f"SELECT sum(id) FROM {name}").splitlines() == ["565"]
    run_query(instance, f"DROP TABLE {name}")


# Reads test.csv through the server on resolver:8081 (started from unstable_server.py)
# and checks the row count and column sums of the result.
def test_storage_s3_get_unstable(started_cluster):
    instance = started_cluster.instances["dummy"]
    bucket = started_cluster.minio_bucket
    table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64"
    get_query = f"SELECT count(), sum(column3), sum(column4) FROM s3('http://resolver:8081/{bucket}/test.csv', 'CSV', '{table_format}') FORMAT CSV"
    output_lines = run_query(instance, get_query).splitlines()
    assert output_lines == ["500001,500000,0"]


# Inserts rows through an S3 table without compression and checks both what the table
# returns and what was actually stored in the bucket.
def test_storage_s3_put_uncompressed(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    filename = "test_put_uncompressed.bin"
    name = "test_put_uncompressed"
    data = [
        "'Gloria Thompson',99",
        "'Matthew Tang',98",
        "'Patsy Anderson',23",
        "'Nancy Badillo',93",
        "'Roy Hunt',5",
        "'Adam Kirk',51",
        "'Joshua Douds',28",
        "'Jolene Ryan',0",
        "'Roxanne Padilla',50",
        "'Howard Roberts',41",
        "'Ricardo Broughton',13",
        "'Roland Speer',83",
        "'Cathy Cohan',58",
        "'Kathie Dawson',100",
        "'Gregg Mcquistion',11",
    ]
    run_query(instance, "CREATE TABLE {} (name String, id UInt32) ENGINE = S3('http://{}:{}/{}/{}', 'CSV')".format(
        name, started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename))

    run_query(instance, "INSERT INTO {} VALUES ({})".format(name, "),(".join(data)))

    # 753 is the sum of the ids listed above. The comparison used to be a bare expression
    # whose result was thrown away; it has to be asserted to check anything.
    assert run_query(instance, "SELECT sum(id) FROM {}".format(name)).splitlines() == ["753"]

    # The stored object must be plain CSV carrying the same ids.
    uncompressed_content = get_s3_file_content(started_cluster, bucket, filename)
    assert sum(int(line.split(',')[1]) for line in uncompressed_content.splitlines()) == 753


# Inserts rows through an S3 table with gzip compression (explicit "gzip" for a .bin key,
# "auto" for a .gz key) and checks both the table contents and the stored gzip object.
@pytest.mark.parametrize("extension,method", [
    pytest.param("bin", "gzip", id="bin"),
    pytest.param("gz", "auto", id="gz")
])
def test_storage_s3_put_gzip(started_cluster, extension, method):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    filename = f"test_put_gzip.{extension}"
    name = f"test_put_gzip_{extension}"
    data = [
        "'Joseph Tomlinson',5",
        "'Earnest Essary',44",
        "'Matha Pannell',24",
        "'Michael Shavers',46",
        "'Elias Groce',38",
        "'Pamela Bramlet',50",
        "'Lewis Harrell',49",
        "'Tamara Fyall',58",
        "'George Dixon',38",
        "'Alice Walls',49",
        "'Paula Mais',24",
        "'Myrtle Pelt',93",
        "'Sylvia Naffziger',18",
        "'Amanda Cave',83",
        "'Yolanda Joseph',89"
    ]
    run_query(instance, f"""CREATE TABLE {name} (name String, id UInt32) ENGINE = S3(
                                'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{filename}',
                                'CSV',
                                '{method}')""")

    run_query(instance, f"INSERT INTO {name} VALUES ({'),('.join(data)})")

    # 708 is the sum of the ids listed above. The comparison used to be a bare expression
    # whose result was thrown away; it has to be asserted to check anything.
    assert run_query(instance, f"SELECT sum(id) FROM {name}").splitlines() == ["708"]

    # The stored object must be a valid gzip stream carrying the same ids.
    buf = io.BytesIO(get_s3_file_content(started_cluster, bucket, filename, decode=False))
    with gzip.GzipFile(fileobj=buf, mode="rb") as f:
        uncompressed_content = f.read().decode()
    assert sum(int(line.split(',')[1]) for line in uncompressed_content.splitlines()) == 708


def test_truncate_table(started_cluster):
    """Check that TRUNCATE on an S3-engine table leaves nothing listed under 'truncate/'.

    The table is created over the object key ``truncate``, filled with ten
    rows, read back, then truncated. The bucket listing is polled for up to
    about 30 seconds before the final assertion.

    :param started_cluster: running cluster fixture providing MinIO and the "dummy" instance.
    """
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    name = "truncate"

    instance.query("CREATE TABLE {} (id UInt32) ENGINE = S3('http://{}:{}/{}/{}', 'CSV')".format(
        name, started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, name))

    instance.query("INSERT INTO {} SELECT number FROM numbers(10)".format(name))
    result = instance.query("SELECT * FROM {}".format(name))
    assert result == instance.query("SELECT number FROM numbers(10)")
    instance.query("TRUNCATE TABLE {}".format(name))

    minio = started_cluster.minio_client
    # Poll once per second, 30 attempts at most. Leave the loop with `break`
    # (not `return`) so the assertion below is always evaluated.
    for _ in range(30):
        if not list(minio.list_objects(bucket, 'truncate/')):
            break
        time.sleep(1)
    assert len(list(minio.list_objects(bucket, 'truncate/'))) == 0

    # NOTE(review): the listing prefix is 'truncate/' while the table's object
    # key is 'truncate' (no trailing slash) -- confirm the prefix matches it.
    # NOTE(review): the original had the assertion below, but the early
    # `return` on the success path meant it never executed. Whether a SELECT
    # from a truncated S3 table returns an empty result or raises has not been
    # verified, so it stays disabled until confirmed against a live server:
    # assert instance.query("SELECT * FROM {}".format(name)) == ""


def test_predefined_connection_configuration(started_cluster):
    """Use the ``s3_conf1`` named configuration from both the S3 engine and the s3() function.

    Ten numbers are written through a table created with ``S3(s3_conf1, ...)``
    and must read back identically through the table and through
    ``s3(s3_conf1, ...)``.

    :param started_cluster: running cluster fixture providing the "dummy" instance.
    """
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    name = "test_table"
    reference_query = "SELECT number FROM numbers(10)"

    instance.query(f"drop table if exists {name}")
    instance.query(f"CREATE TABLE {name} (id UInt32) ENGINE = S3(s3_conf1, format='CSV')")
    instance.query(f"INSERT INTO {name} SELECT number FROM numbers(10)")

    # Read back through the table itself.
    via_table = instance.query(f"SELECT * FROM {name}")
    assert via_table == instance.query(reference_query)

    # Read the same data through the table function with the same named config.
    via_function = instance.query("SELECT * FROM s3(s3_conf1, format='CSV', structure='id UInt32')")
    assert via_function == instance.query(reference_query)


# Written by the worker thread in test_url_reconnect_in_the_middle.
result = ""


def test_url_reconnect_in_the_middle(started_cluster):
    """Read a large TSV object via url() while the connection is disturbed mid-query.

    A file of 1,000,000 rows is written with s3(). It is then read through
    url() in a background thread while packet rules are applied to traffic
    coming from the MinIO port to the instance: a probabilistic TCP reset for
    the whole read, plus a temporary DROP of everything. The query runs with
    ``http_max_tries = 10`` and must still produce the expected checksum.

    :param started_cluster: running cluster fixture providing MinIO and the "dummy" instance.
    """
    global result
    # Reset so that a value left by an earlier run cannot satisfy the final check.
    result = ""
    expected_checksum = 3914219105369203805

    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    table_format = "id String, data String"
    filename = "test_url_reconnect_{}.tsv".format(random.randint(0, 1000))

    instance.query(f"""insert into table function
                   s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'TSV', '{table_format}')
                   select number, randomPrintableASCII(number % 1000) from numbers(1000000)""")

    with PartitionManager() as pm:
        pm_rule_reject = {'probability': 0.02, 'destination': instance.ip_address, 'source_port': started_cluster.minio_port, 'action': 'REJECT --reject-with tcp-reset'}
        pm_rule_drop_all = {'destination': instance.ip_address, 'source_port': started_cluster.minio_port, 'action': 'DROP'}
        pm._add_rule(pm_rule_reject)

        def select():
            # Only store the result here: an AssertionError raised inside a
            # worker thread would not fail the test, so the check is done in
            # the main thread after join().
            global result
            result = instance.query(
                f"""select sum(cityHash64(x)) from (select toUInt64(id) + sleep(0.1) as x from
                url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'TSV', '{table_format}')
                settings http_max_tries = 10, http_retry_max_backoff_ms=2000, http_send_timeout=1, http_receive_timeout=1)""")

        thread = threading.Thread(target=select)
        thread.start()
        # Let the read get going, then drop all matching traffic for two seconds.
        time.sleep(4)
        pm._add_rule(pm_rule_drop_all)

        time.sleep(2)
        pm._delete_rule(pm_rule_drop_all)
        pm._delete_rule(pm_rule_reject)

        thread.join()

        # The original `assert(int(result), N)` asserted a two-element tuple,
        # which is always truthy. Compare the values instead. If the query
        # failed in the thread, `result` is still "" and int() raises here.
        assert int(result) == expected_checksum


def test_seekable_formats(started_cluster):
    """Round-trip 5,000,000 rows through Parquet and ORC via s3() and check memory usage.

    After both formats are written and counted, the most recent
    ``SELECT count() FROM s3...`` entry in ``system.query_log`` with non-zero
    ``memory_usage`` must report less than 200 MiB.

    :param started_cluster: running cluster fixture providing the "dummy" instance.
    """
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance

    table_function = "s3(s3_parquet, structure='a Int32, b String', format='Parquet')"
    instance.query(f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)")

    result = instance.query(f"SELECT count() FROM {table_function}")
    assert int(result) == 5000000

    table_function = "s3(s3_orc, structure='a Int32, b String', format='ORC')"
    exec_query_with_retry(instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)")

    result = instance.query(f"SELECT count() FROM {table_function}")
    assert int(result) == 5000000

    instance.query("SYSTEM FLUSH LOGS")
    result = instance.query("SELECT formatReadableSize(memory_usage) FROM system.query_log WHERE startsWith(query, 'SELECT count() FROM s3') AND memory_usage > 0 ORDER BY event_time desc")
    # Rows are newest-first; only the first one is checked (as before).
    latest = result.splitlines()[0]
    print(latest)
    # formatReadableSize yields "<number> <unit>", e.g. "150.00 MiB". Taking the
    # first three characters broke on values such as "99.50 MiB" (int("99.")
    # raises) and ignored the unit, so parse both parts explicitly.
    amount, unit = latest.split()
    bytes_per_unit = {"B": 1, "KiB": 2 ** 10, "MiB": 2 ** 20, "GiB": 2 ** 30, "TiB": 2 ** 40}
    # NOTE(review): the original bare `< 200` is presumably a 200 MiB limit -- confirm.
    assert float(amount) * bytes_per_unit[unit] < 200 * 2 ** 20


def test_seekable_formats_url(started_cluster):
    """Write Parquet and ORC via s3(), read them back via url(), and check memory usage.

    After both formats are counted through url(), the most recent
    ``SELECT count() FROM url...`` entry in ``system.query_log`` with non-zero
    ``memory_usage`` must report less than 200 MiB.

    :param started_cluster: running cluster fixture providing MinIO and the "dummy" instance.
    """
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    table_function = "s3(s3_parquet, structure='a Int32, b String', format='Parquet')"
    instance.query(f"insert into table function {table_function} select number, randomString(100) from numbers(5000000)")

    table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_parquet', 'Parquet', 'a Int32, b String')"
    result = instance.query(f"SELECT count() FROM {table_function}")
    assert int(result) == 5000000

    table_function = "s3(s3_orc, structure='a Int32, b String', format='ORC')"
    exec_query_with_retry(instance, f"insert into table function {table_function} select number, randomString(100) from numbers(5000000)")

    table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_orc', 'ORC', 'a Int32, b String')"
    result = instance.query(f"SELECT count() FROM {table_function}")
    assert int(result) == 5000000

    instance.query("SYSTEM FLUSH LOGS")
    result = instance.query("SELECT formatReadableSize(memory_usage) FROM system.query_log WHERE startsWith(query, 'SELECT count() FROM url') AND memory_usage > 0 ORDER BY event_time desc")
    # Rows are newest-first; only the first one is checked (as before).
    latest = result.splitlines()[0]
    print(latest)
    # formatReadableSize yields "<number> <unit>", e.g. "150.00 MiB". Taking the
    # first three characters broke on values such as "99.50 MiB" (int("99.")
    # raises) and ignored the unit, so parse both parts explicitly.
    amount, unit = latest.split()
    bytes_per_unit = {"B": 1, "KiB": 2 ** 10, "MiB": 2 ** 20, "GiB": 2 ** 30, "TiB": 2 ** 40}
    # NOTE(review): the original bare `< 200` is presumably a 200 MiB limit -- confirm.
    assert float(amount) * bytes_per_unit[unit] < 200 * 2 ** 20


def test_empty_file(started_cluster):
    """A zero-length object read as CSV through s3() must yield a row count of 0.

    :param started_cluster: running cluster fixture providing MinIO and the "dummy" instance.
    """
    instance = started_cluster.instances["dummy"]
    bucket = started_cluster.minio_bucket
    object_key = "empty"

    # Upload an object with no content (declared length 0).
    started_cluster.minio_client.put_object(bucket, object_key, io.BytesIO(b""), 0)

    object_url = f'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{object_key}'
    row_count = instance.query(f"SELECT count() FROM s3('{object_url}', 'CSV', 'id Int32')")
    assert int(row_count) == 0


def test_insert_with_path_with_globs(started_cluster):
    """Issue an INSERT into an s3() table function whose path contains a glob (``*``).

    The query goes through ``query_and_get_error``, which presumably fails the
    test if the query succeeds -- i.e. the insert is expected to be rejected.

    :param started_cluster: running cluster fixture providing the "dummy" instance.
    """
    instance = started_cluster.instances["dummy"]

    glob_target = "s3('http://minio1:9001/root/test_parquet*', 'minio', 'minio123', 'Parquet', 'a Int32, b String')"
    insert_query = f"insert into table function {glob_target} SELECT number, randomString(100) FROM numbers(500)"
    instance.query_and_get_error(insert_query)
-												Added test for GZIP in S3 storage.

											
										
										
											2020-09-28 23:30:41 +00:00
+								import gzip
-												Tests decomposition.

											
										
										
											2019-09-19 09:34:33 +00:00
+								import json
 								import logging
-												Format and optimize imports in integration test files

This PR formats all the `*.py` files found under the `tests/integration`
folder. It also reorders the imports and cleans up a bunch of unused
imports.

The formatting also takes care of other things like wrapping lines and
fixing spaces and indents such that the tests look more readable.

											
										
										
											2020-09-16 04:26:10 +00:00
+								import os
-												Convert to python3 (#15007)


											
										
										
											2020-10-02 16:54:07 +00:00
+								import io
-												Added some tests.

											
										
										
											2020-01-27 21:44:18 +00:00
+								import random
-												Added test for multi-page S3 globbing.

											
										
										
											2020-05-25 09:15:11 +00:00
+								import threading
-												Fixed bug in GZIP compression in S3 storage.

											
										
										
											2020-09-30 13:09:55 +00:00
+								import time
-												Tests decomposition.

											
										
										
											2019-09-19 09:34:33 +00:00
-												Format and optimize imports in integration test files

This PR formats all the `*.py` files found under the `tests/integration`
folder. It also reorders the imports and cleans up a bunch of unused
imports.

The formatting also takes care of other things like wrapping lines and
fixing spaces and indents such that the tests look more readable.

											
										
										
											2020-09-16 04:26:10 +00:00
+								import helpers.client
-												Attempt to make integration tests.

											
										
										
											2019-06-26 00:41:14 +00:00
+								import pytest
-												Fix some flaky tests

											
										
										
											2021-06-21 08:02:27 +00:00
+								from helpers.cluster import ClickHouseCluster, ClickHouseInstance, get_instances_dir
-												add test

											
										
										
											2021-11-09 20:11:02 +00:00
+								from helpers.network import PartitionManager
-												Fix test

											
										
										
											2021-12-22 10:57:08 +00:00
+								from helpers.test_tools import exec_query_with_retry
-												Tests decomposition.

											
										
										
											2019-09-19 09:34:33 +00:00
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								MINIO_INTERNAL_PORT = 9001
-												Tests decomposition.

											
										
										
											2019-09-19 09:34:33 +00:00
-												recreate S3 client if credentials changed

											
										
										
											2021-03-04 15:56:55 +00:00
+								SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
-												Fix some flaky tests

											
										
										
											2021-06-21 08:02:27 +00:00
 								CONFIG_PATH = os.path.join(SCRIPT_DIR, './{}/dummy/configs/config.d/defaultS3.xml'.format(get_instances_dir()))
-												recreate S3 client if credentials changed

											
										
										
											2021-03-04 15:56:55 +00:00
-												Tests decomposition.

											
										
										
											2019-09-19 09:34:33 +00:00
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								# Creates S3 bucket for tests and allows anonymous read-write access to it.
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								def prepare_s3_bucket(started_cluster):
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								    # Allows read-write access for bucket without authorization.
 								    bucket_read_write_policy = {"Version": "2012-10-17",
 								                                "Statement": [
 								                                    {
 								                                        "Sid": "",
 								                                        "Effect": "Allow",
 								                                        "Principal": {"AWS": "*"},
 								                                        "Action": "s3:GetBucketLocation",
 								                                        "Resource": "arn:aws:s3:::root"
 								                                    },
 								                                    {
 								                                        "Sid": "",
 								                                        "Effect": "Allow",
 								                                        "Principal": {"AWS": "*"},
 								                                        "Action": "s3:ListBucket",
 								                                        "Resource": "arn:aws:s3:::root"
 								                                    },
 								                                    {
 								                                        "Sid": "",
 								                                        "Effect": "Allow",
 								                                        "Principal": {"AWS": "*"},
 								                                        "Action": "s3:GetObject",
 								                                        "Resource": "arn:aws:s3:::root/*"
 								                                    },
 								                                    {
 								                                        "Sid": "",
 								                                        "Effect": "Allow",
 								                                        "Principal": {"AWS": "*"},
 								                                        "Action": "s3:PutObject",
 								                                        "Resource": "arn:aws:s3:::root/*"
 								                                    }
 								                                ]}
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								    minio_client = started_cluster.minio_client
 								    minio_client.set_bucket_policy(started_cluster.minio_bucket, json.dumps(bucket_read_write_policy))
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								    started_cluster.minio_restricted_bucket = "{}-with-auth".format(started_cluster.minio_bucket)
 								    if minio_client.bucket_exists(started_cluster.minio_restricted_bucket):
 								        minio_client.remove_bucket(started_cluster.minio_restricted_bucket)
-												Fixed tests and logic of authorization in S3.

											
										
										
											2019-12-01 11:24:55 +00:00
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								    minio_client.make_bucket(started_cluster.minio_restricted_bucket)
-												Fixed tests and logic of authorization in S3.

											
										
										
											2019-12-01 11:24:55 +00:00
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								def put_s3_file_content(started_cluster, bucket, filename, data):
-												Convert to python3 (#15007)


											
										
										
											2020-10-02 16:54:07 +00:00
+								    buf = io.BytesIO(data)
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								    started_cluster.minio_client.put_object(bucket, filename, buf, len(data))
-												Added test for GZIP in S3 storage.

											
										
										
											2020-09-28 23:30:41 +00:00
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								# Returns content of given S3 file as string.
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								def get_s3_file_content(started_cluster, bucket, filename, decode=True):
-												more

											
										
										
											2021-06-02 15:08:16 +00:00
+								    # type: (ClickHouseCluster, str, str, bool) -> str
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								    data = started_cluster.minio_client.get_object(bucket, filename)
-												Convert to python3 (#15007)


											
										
										
											2020-10-02 16:54:07 +00:00
+								    data_str = b""
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								    for chunk in data.stream():
 								        data_str += chunk
-												Convert to python3 (#15007)


											
										
										
											2020-10-02 16:54:07 +00:00
+								    if decode:
 								        return data_str.decode()
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								    return data_str
-												Tests decomposition.

											
										
										
											2019-09-19 09:34:33 +00:00
-												Attempt to make integration tests.

											
										
										
											2019-06-26 00:41:14 +00:00
+								@pytest.fixture(scope="module")
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								def started_cluster():
-												Attempt to make integration tests.

											
										
										
											2019-06-26 00:41:14 +00:00
+								    try:
 								        cluster = ClickHouseCluster(__file__)
-												Add default credentials and custom headers for s3 table functions.

											
										
										
											2020-06-01 17:16:09 +00:00
+								        cluster.add_instance("restricted_dummy", main_configs=["configs/config_for_test_remote_host_filter.xml"],
 								                             with_minio=True)
-												Tests, url table function

											
										
										
											2021-09-08 19:28:22 +00:00
+								        cluster.add_instance("dummy", with_minio=True, main_configs=["configs/defaultS3.xml", "configs/named_collections.xml"])
-												recreate S3 client if credentials changed

											
										
										
											2021-03-04 15:56:55 +00:00
+								        cluster.add_instance("s3_max_redirects", with_minio=True, main_configs=["configs/defaultS3.xml"],
 								                             user_configs=["configs/s3_max_redirects.xml"])
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								        logging.info("Starting cluster...")
-												Attempt to make integration tests.

											
										
										
											2019-06-26 00:41:14 +00:00
+								        cluster.start()
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								        logging.info("Cluster started")
-												Tests decomposition.

											
										
										
											2019-09-19 09:34:33 +00:00
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								        prepare_s3_bucket(cluster)
 								        logging.info("S3 bucket created")
-												Added tests.

											
										
										
											2021-04-12 08:55:54 +00:00
+								        run_s3_mocks(cluster)
-												Tests decomposition.

											
										
										
											2019-09-19 09:34:33 +00:00
-												Attempt to make integration tests.

											
										
										
											2019-06-26 00:41:14 +00:00
+								        yield cluster
 								    finally:
 								        cluster.shutdown()
-												Fixes.

											
										
										
											2019-09-24 10:58:42 +00:00
+								def run_query(instance, query, stdin=None, settings=None):
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								    # type: (ClickHouseInstance, str, object, dict) -> str
-												Minor review fixes.

											
										
										
											2019-09-22 10:42:47 +00:00
+								    logging.info("Running query '{}'...".format(query))
-												Fixes.

											
										
										
											2019-09-24 10:58:42 +00:00
+								    result = instance.query(query, stdin=stdin, settings=settings)
-												Minor review fixes.

											
										
										
											2019-09-22 10:42:47 +00:00
+								    logging.info("Query finished")
-												Attempt to make integration tests.

											
										
										
											2019-06-26 00:41:14 +00:00
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								    return result
-												Minor review fixes.

											
										
										
											2019-09-22 10:42:47 +00:00
-												Improved test.

											
										
										
											2021-03-02 16:53:03 +00:00
+								# Test simple put. Also checks that wrong credentials produce an error with every compression method.
-												Added test for put with compression and wrong credentials.

											
										
										
											2021-03-02 02:43:19 +00:00
+								@pytest.mark.parametrize("maybe_auth,positive,compression", [
-												wip

											
										
										
											2021-04-12 07:03:12 +00:00
+								    pytest.param("", True, 'auto', id="positive"),
 								    pytest.param("'minio','minio123',", True, 'auto', id="auth_positive"),
-												better

											
										
										
											2021-04-29 14:26:41 +00:00
+								    pytest.param("'wrongid','wrongkey',", False, 'auto', id="auto"),
 								    pytest.param("'wrongid','wrongkey',", False, 'gzip', id="gzip"),
 								    pytest.param("'wrongid','wrongkey',", False, 'deflate', id="deflate"),
 								    pytest.param("'wrongid','wrongkey',", False, 'brotli', id="brotli"),
 								    pytest.param("'wrongid','wrongkey',", False, 'xz', id="xz"),
 								    pytest.param("'wrongid','wrongkey',", False, 'zstd', id="zstd")
-												Fixed tests and logic of authorization in S3.

											
										
										
											2019-12-01 11:24:55 +00:00
+								])
-												Merge remote-tracking branch 'origin' into integration-2

											
										
										
											2021-04-08 09:30:24 +00:00
+								def test_put(started_cluster, maybe_auth, positive, compression):
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								    # type: (ClickHouseCluster) -> None
-												Tests decomposition.

											
										
										
											2019-09-19 09:34:33 +00:00
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								    bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket
 								    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
-												Minor review fixes.

											
										
										
											2019-09-22 10:42:47 +00:00
+								    values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								    values_csv = "1,2,3\n3,2,1\n78,43,45\n"
 								    filename = "test.csv"
-												Merge remote-tracking branch 'origin' into integration-2

											
										
										
											2021-04-14 11:21:40 +00:00
+								    put_query = f"""insert into table function s3('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/{filename}',
-												Added test for put with compression and wrong credentials.

											
										
										
											2021-03-02 02:43:19 +00:00
+								                    {maybe_auth}'CSV', '{table_format}', {compression}) values {values}"""
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
-												Fixed tests and logic of authorization in S3.

											
										
										
											2019-12-01 11:24:55 +00:00
+								    try:
 								        run_query(instance, put_query)
 								    except helpers.client.QueryRuntimeException:
-												AWS S3 SDK integration.

											
										
										
											2019-12-03 16:23:24 +00:00
+								        if positive:
 								            raise
-												Fixed tests and logic of authorization in S3.

											
										
										
											2019-12-01 11:24:55 +00:00
+								    else:
 								        assert positive
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								        assert values_csv == get_s3_file_content(started_cluster, bucket, filename)
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
-												Validate uft8 in partition key from PARTITION BY for s3

											
										
										
											2021-08-19 11:05:15 +00:00
+								def test_partition_by(started_cluster):
-												Fixed test.

											
										
										
											2021-07-29 01:46:41 +00:00
+								    bucket = started_cluster.minio_bucket
 								    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
-												Test.

											
										
										
											2021-05-27 06:14:12 +00:00
+								    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
-												Validate uft8 in partition key from PARTITION BY for s3

											
										
										
											2021-08-19 11:05:15 +00:00
+								    partition_by = "column3"
-												Test.

											
										
										
											2021-05-27 06:14:12 +00:00
+								    values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
 								    filename = "test_{_partition_id}.csv"
-												Validate uft8 in partition key from PARTITION BY for s3

											
										
										
											2021-08-19 11:05:15 +00:00
+								    put_query = f"""INSERT INTO TABLE FUNCTION
 								        s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV', '{table_format}')
 								        PARTITION BY {partition_by} VALUES {values}"""
-												Test.

											
										
										
											2021-05-27 06:14:12 +00:00
-												Fixes.

											
										
										
											2021-05-31 08:46:28 +00:00
+								    run_query(instance, put_query)
-												Test fix.

											
										
										
											2021-07-26 03:01:49 +00:00
+								    assert "1,2,3\n" == get_s3_file_content(started_cluster, bucket, "test_3.csv")
 								    assert "3,2,1\n" == get_s3_file_content(started_cluster, bucket, "test_1.csv")
 								    assert "78,43,45\n" == get_s3_file_content(started_cluster, bucket, "test_45.csv")
-												Test.

											
										
										
											2021-05-27 06:14:12 +00:00
-												For storage

											
										
										
											2021-10-26 12:22:13 +00:00
+								    filename = "test2_{_partition_id}.csv"
 								    instance.query(f"create table p ({table_format}) engine=S3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV') partition by column3")
 								    instance.query(f"insert into p values {values}")
 								    assert "1,2,3\n" == get_s3_file_content(started_cluster, bucket, "test2_3.csv")
 								    assert "3,2,1\n" == get_s3_file_content(started_cluster, bucket, "test2_1.csv")
 								    assert "78,43,45\n" == get_s3_file_content(started_cluster, bucket, "test2_45.csv")
-												Test.

											
										
										
											2021-05-27 06:14:12 +00:00
-												Validate uft8 in partition key from PARTITION BY for s3

											
										
										
											2021-08-19 11:05:15 +00:00
+								def test_partition_by_string_column(started_cluster):
 								    bucket = started_cluster.minio_bucket
 								    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
 								    table_format = "col_num UInt32, col_str String"
 								    partition_by = "col_str"
-												Do not allow slashes in bucket formatted from PARTITION BY

											
										
										
											2021-08-19 11:21:21 +00:00
+								    values = "(1, 'foo/bar'), (3, 'йцук'), (78, '你好')"
-												Validate uft8 in partition key from PARTITION BY for s3

											
										
										
											2021-08-19 11:05:15 +00:00
+								    filename = "test_{_partition_id}.csv"
 								    put_query = f"""INSERT INTO TABLE FUNCTION
 								        s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV', '{table_format}')
 								        PARTITION BY {partition_by} VALUES {values}"""
 								    run_query(instance, put_query)
-												Do not allow slashes in bucket formatted from PARTITION BY

											
										
										
											2021-08-19 11:21:21 +00:00
+								    assert '1,"foo/bar"\n' == get_s3_file_content(started_cluster, bucket, "test_foo/bar.csv")
-												Validate uft8 in partition key from PARTITION BY for s3

											
										
										
											2021-08-19 11:05:15 +00:00
+								    assert '3,"йцук"\n' == get_s3_file_content(started_cluster, bucket, "test_йцук.csv")
 								    assert '78,"你好"\n' == get_s3_file_content(started_cluster, bucket, "test_你好.csv")
 								def test_partition_by_const_column(started_cluster):
-												Added integration test.

											
										
										
											2021-07-29 10:56:32 +00:00
+								    bucket = started_cluster.minio_bucket
 								    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
 								    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
 								    values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
-												Validate uft8 in partition key from PARTITION BY for s3

											
										
										
											2021-08-19 11:05:15 +00:00
+								    partition_by = "'88'"
-												Added integration test.

											
										
										
											2021-07-29 10:56:32 +00:00
+								    values_csv = "1,2,3\n3,2,1\n78,43,45\n"
 								    filename = "test_{_partition_id}.csv"
-												Validate uft8 in partition key from PARTITION BY for s3

											
										
										
											2021-08-19 11:05:15 +00:00
+								    put_query = f"""INSERT INTO TABLE FUNCTION
 								        s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV', '{table_format}')
 								        PARTITION BY {partition_by} VALUES {values}"""
-												Added integration test.

											
										
										
											2021-07-29 10:56:32 +00:00
 								    run_query(instance, put_query)
 								    assert values_csv == get_s3_file_content(started_cluster, bucket, "test_88.csv")
-												Fixed bug with S3 URLs containing `+` symbol, data with such keys could not be read previously.

											
										
										
											2021-05-04 06:25:33 +00:00
@pytest.mark.parametrize("special", [
    "space",
    "plus"
])
def test_get_file_with_special(started_cluster, special):
    """Read back an S3 object whose key contains a space or a ``+``.

    The object is uploaded under a key containing the literal character and
    is then selected three ways: by its exact URL-encoded key, by a glob that
    stops before the special character, and by a glob that follows the
    URL-encoded special character. Every query must return the uploaded rows.
    """
    literal_char, encoded_char = {"space": (" ", "%20"), "plus": ("+", "%2B")}[special]
    auth = "'minio','minio123',"
    bucket = started_cluster.minio_restricted_bucket
    instance = started_cluster.instances["dummy"]
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = [[12549, 2463, 19893], [64021, 38652, 66703], [81611, 39650, 83516], [11079, 59507, 61546], [51764, 69952, 6876], [41165, 90293, 29095], [40167, 78432, 48309], [81629, 81327, 11855], [55852, 21643, 98507], [6738, 54643, 41155]]

    # One CSV line per row, each terminated by a newline, uploaded as bytes.
    csv_payload = "".join(",".join(str(cell) for cell in row) + "\n" for row in values).encode()
    filename = f"get_file_with_{special}_{literal_char}two.csv"
    put_s3_file_content(started_cluster, bucket, filename, csv_payload)

    def select_rows(key_pattern):
        """Select everything matching *key_pattern* and parse the TSV output into int rows."""
        get_query = f"SELECT * FROM s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{key_pattern}', {auth}'CSV', '{table_format}') FORMAT TSV"
        output = run_query(instance, get_query)
        return [[int(field) for field in line.split()] for line in output.splitlines()]

    # Exact key, with the special character URL-encoded.
    assert select_rows(f"get_file_with_{special}_{encoded_char}two.csv") == values
    # Glob that ends before the special character.
    assert select_rows(f"get_file_with_{special}*.csv") == values
    # Glob that comes after the URL-encoded special character.
    assert select_rows(f"get_file_with_{special}_{encoded_char}*.csv") == values
 								@pytest.mark.parametrize("special", [
 								    "space",
 								    "plus",
 								    "plus2"
 								])
-												s3 catch up

											
										
										
											2021-05-12 07:03:53 +00:00
+								def test_get_path_with_special(started_cluster, special):
-												Fixed bug with S3 URLs containing `+` symbol, data with such keys could not be read previously.

											
										
										
											2021-05-04 06:25:33 +00:00
+								    symbol = {"space": "%20", "plus": "%2B", "plus2": "%2B"}[special]
 								    safe_symbol = {"space": "%20", "plus": "+", "plus2": "%2B"}[special]
 								    auth = "'minio','minio123',"
 								    table_format = "column1 String"
-												s3 catch up

											
										
										
											2021-05-12 07:03:53 +00:00
+								    instance = started_cluster.instances["dummy"]
-												Fixed bug with S3 URLs containing `+` symbol, data with such keys could not be read previously.

											
										
										
											2021-05-04 06:25:33 +00:00
+								    get_query = f"SELECT * FROM s3('http://resolver:8082/get-my-path/{safe_symbol}.csv', {auth}'CSV', '{table_format}') FORMAT TSV"
 								    assert run_query(instance, get_query).splitlines() == [f"/{symbol}.csv"]
-												Abort multipart upload if no data was written to WriteBufferFromS3. (#16840)


											
										
										
											2020-11-11 12:15:16 +00:00
# Test put no data to S3.
@pytest.mark.parametrize("auth", [
    pytest.param("'minio','minio123',", id="minio")
])
def test_empty_put(started_cluster, auth):
    # type: (ClickHouseCluster, str) -> None
    """Insert zero rows into S3 and check that no object is left behind.

    An empty ``Null``-engine table is used as the source of an
    ``insert ... select`` into the ``s3`` table function. Reading the target
    key afterwards must fail with a "The specified key does not exist" error.
    """
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"

    drop_empty_table_query = "DROP TABLE IF EXISTS empty_table"
    create_empty_table_query = """
        CREATE TABLE empty_table (
        {}
        ) ENGINE = Null()
    """.format(table_format)

    run_query(instance, drop_empty_table_query)
    run_query(instance, create_empty_table_query)

    filename = "empty_put_test.csv"
    put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') select * from empty_table".format(
        started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename, auth, table_format)

    run_query(instance, put_query)

    count_query = "select count(*) from s3('http://{}:{}/{}/{}', {}'CSV', '{}')".format(
        started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename, auth, table_format)
    try:
        run_query(instance, count_query)
    except helpers.client.QueryRuntimeException as e:
        # str.find() returns -1 when the text is absent, so comparing its
        # result with 0 never checked the message; test membership instead.
        assert "The specified key does not exist" in str(e)
    else:
        pytest.fail("Query should be failed.")
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
# Test put values in CSV format.
@pytest.mark.parametrize("maybe_auth,positive", [
    pytest.param("", True, id="positive"),
    pytest.param("'minio','minio123',", True, id="auth_positive"),
    pytest.param("'wrongid','wrongkey',", False, id="negative"),
])
def test_put_csv(started_cluster, maybe_auth, positive):
    # type: (ClickHouseCluster, str, bool) -> None
    """Insert CSV data supplied on stdin into S3, with and without credentials.

    ``maybe_auth`` is spliced verbatim into the ``s3(...)`` argument list
    (empty, or ``'key','secret',`` including the trailing comma). Any
    non-empty value switches the target to the restricted bucket.
    ``positive`` says whether the insert is expected to succeed; on success
    the stored object must equal the data that was sent.
    """
    if maybe_auth:
        bucket = started_cluster.minio_restricted_bucket
    else:
        bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    put_query = (
        f"insert into table function s3('http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{filename}', "
        f"{maybe_auth}'CSV', '{table_format}') format CSV"
    )
    csv_data = "8,9,16\n11,18,13\n22,14,2\n"

    try:
        run_query(instance, put_query, stdin=csv_data)
    except helpers.client.QueryRuntimeException:
        if not positive:
            # The failure was the expected outcome; nothing more to verify.
            return
        raise

    # Reaching this point means the insert succeeded, which is only valid for
    # a positive case.
    assert positive
    assert csv_data == get_s3_file_content(started_cluster, bucket, filename)
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
 								# Test put and get with S3 server redirect.
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								def test_put_get_with_redirect(started_cluster):
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								    # type: (ClickHouseCluster) -> None
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								    bucket = started_cluster.minio_bucket
 								    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
+								    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
 								    values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
 								    values_csv = "1,1,1\n1,1,1\n11,11,11\n"
 								    filename = "test.csv"
 								    query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, filename, table_format, values)
-												Tests decomposition.

											
										
										
											2019-09-19 09:34:33 +00:00
+								    run_query(instance, query)
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								    assert values_csv == get_s3_file_content(started_cluster, bucket, filename)
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
 								    query = "select *, column1*column2*column3 from s3('http://{}:{}/{}/{}', 'CSV', '{}')".format(
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, filename, table_format)
-												Tests decomposition.

											
										
										
											2019-09-19 09:34:33 +00:00
+								    stdout = run_query(instance, query)
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
-												Tests decomposition.

											
										
										
											2019-09-19 09:34:33 +00:00
+								    assert list(map(str.split, stdout.splitlines())) == [
-												Minor review fixes.

											
										
										
											2019-09-22 10:42:47 +00:00
+								        ["1", "1", "1", "1"],
 								        ["1", "1", "1", "1"],
 								        ["11", "11", "11", "1331"],
-												Attempt to make integration tests.

											
										
										
											2019-06-26 00:41:14 +00:00
+								    ]
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
-												Use only 's3_max_redirect' in params instead of all settings

											
										
										
											2020-11-23 11:02:17 +00:00
# Test put with restricted S3 server redirect.
def test_put_with_zero_redirect(started_cluster):
    # type: (ClickHouseCluster) -> None
    """Check that the ``s3_max_redirects`` instance refuses to follow a redirect.

    The same insert is first sent directly to Minio, where it must succeed,
    and then to the redirecting endpoint, where it must fail with a
    "Too many redirects while trying to access" error.
    """
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["s3_max_redirects"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
    filename = "test.csv"

    # Should work without redirect
    query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
        started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename, table_format, values)
    run_query(instance, query)

    # Should not work with redirect
    query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, filename, table_format, values)
    # try/except/else keeps the two failure modes apart: a wrong error message
    # fails on the message assertion itself, a missing error fails in `else`.
    # An assert placed in `finally` would instead raise a second, message-less
    # AssertionError on top of the first one.
    try:
        run_query(instance, query)
    except helpers.client.QueryRuntimeException as e:
        assert "Too many redirects while trying to access" in str(e)
    else:
        pytest.fail("Insert through the redirecting endpoint was expected to fail.")
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
def test_put_get_with_globs(started_cluster):
    # type: (ClickHouseCluster) -> None
    """Upload 100 small objects and read them back with a single glob query.

    Objects are written to ``<prefix>/<i>_<letter>/<j>.csv`` for ``i`` and
    ``j`` in 0..9, with a random letter from a-d and a random numeric prefix
    per run. Each holds the row ``(i, j, i + j)``. The select goes through the
    redirecting endpoint and combines ``*``, ``{a,b,c,d}`` and ``%3f``
    patterns; it must report the column sums 450/450/900, the smallest
    ``_file`` and the largest ``_path``.

    Everything uploaded under the prefix is removed afterwards, including
    when an insert or the assertion fails.
    """
    unique_prefix = random.randint(1, 10000)
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    max_path = ""
    try:
        for i in range(10):
            for j in range(10):
                path = "{}/{}_{}/{}.csv".format(unique_prefix, i, random.choice(['a', 'b', 'c', 'd']), j)
                # Track the lexicographically largest key for the max(_path) check.
                max_path = max(path, max_path)
                values = "({},{},{})".format(i, j, i + j)
                query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
                    started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, path, table_format, values)
                run_query(instance, query)

        query = "select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from s3('http://{}:{}/{}/{}/*_{{a,b,c,d}}/%3f.csv', 'CSV', '{}')".format(
            started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, unique_prefix, table_format)
        assert run_query(instance, query).splitlines() == [
            "450\t450\t900\t0.csv\t{bucket}/{max_path}".format(bucket=bucket, max_path=max_path)]
    finally:
        # Clean up in `finally` so a failed run does not leave objects behind
        # in the shared bucket. The listing is materialised first so objects
        # are not deleted while the listing is still being iterated.
        minio = started_cluster.minio_client
        for obj in list(minio.list_objects(bucket, prefix='{}/'.format(unique_prefix), recursive=True)):
            minio.remove_object(bucket, obj.object_name)
-												Added some tests.

											
										
										
											2020-01-27 21:44:18 +00:00
-												Use Minio for S3 Storage integration tests.

											
										
										
											2019-11-20 11:56:38 +00:00
# Test multipart put.
@pytest.mark.parametrize("maybe_auth,positive", [
    pytest.param("", True, id="positive"),
    # The trailing comma is required: the fragment is spliced directly in
    # front of 'CSV' in the s3(...) argument list. Without it the secret and
    # the format name fuse into one SQL string literal ('wrongkey''CSV').
    pytest.param("'wrongid','wrongkey',", False, id="negative"),
    # ("'minio','minio123',",True), Redirect with credentials not working with nginx.
])
def test_multipart_put(started_cluster, maybe_auth, positive):
    # type: (ClickHouseCluster, str, bool) -> None
    """Upload a CSV large enough to need a multipart upload through the redirect endpoint.

    ``maybe_auth`` is spliced verbatim into the ``s3(...)`` argument list;
    any non-empty value switches the target to the restricted bucket.
    ``positive`` says whether the insert is expected to succeed. On success
    the proxy access log must show at least two PUT requests for the object
    and the stored content must equal the data that was sent.
    """
    bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"

    # Minimum size of part is 5 Mb for Minio.
    # See: https://github.com/minio/minio/blob/master/docs/minio-limits.md
    min_part_size_bytes = 5 * 1024 * 1024
    csv_size_bytes = int(min_part_size_bytes * 1.5)  # To have 2 parts.

    csv_line = "1,2,3\n"  # 3 digits, 2 commas, 1 line separator.

    # Generate data having size more than one part
    csv_data = csv_line * (csv_size_bytes // len(csv_line))

    assert len(csv_data) > min_part_size_bytes

    filename = "test_multipart.csv"
    put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format(
        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, filename, maybe_auth, table_format)

    try:
        # A zero single-part limit forces the multipart code path.
        run_query(instance, put_query, stdin=csv_data, settings={'s3_min_upload_part_size': min_part_size_bytes,
                                                                 's3_max_single_part_upload_size': 0})
    except helpers.client.QueryRuntimeException:
        if positive:
            raise
    else:
        assert positive

        # Use proxy access logs to count number of parts uploaded to Minio.
        proxy_logs = started_cluster.get_container_logs("proxy1")  # type: str
        assert proxy_logs.count("PUT /{}/{}".format(bucket, filename)) >= 2

        assert csv_data == get_s3_file_content(started_cluster, bucket, filename)
-												Added integration test for storage_s3

											
										
										
											2019-11-06 17:06:50 +00:00
-												Fix test

											
										
										
											2019-12-03 17:36:02 +00:00
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
def test_remote_host_filter(started_cluster):
    """Check that the ``restricted_dummy`` instance rejects an S3 URL on ``invalid_host``.

    Both a select from and an insert into the ``s3`` table function must
    produce an error containing "not allowed in configuration file".
    """
    instance = started_cluster.instances["restricted_dummy"]
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    blocked_url = f"http://invalid_host:{MINIO_INTERNAL_PORT}/{started_cluster.minio_bucket}/test.csv"
    expected_error = "not allowed in configuration file"

    # Reading from the host must be refused.
    select_query = f"select *, column1*column2*column3 from s3('{blocked_url}', 'CSV', '{table_format}')"
    select_error = instance.query_and_get_error(select_query)
    assert expected_error in select_error

    # Writing to the host must be refused as well.
    other_values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
    insert_query = f"insert into table function s3('{blocked_url}', 'CSV', '{table_format}') values {other_values}"
    insert_error = instance.query_and_get_error(insert_query)
    assert expected_error in insert_error
-												AWS SDK integration - move s3 storage syntax tests to integration.

											
										
										
											2019-12-10 16:11:13 +00:00
 								@pytest.mark.parametrize("s3_storage_args", [
-												wip

											
										
										
											2021-04-12 07:03:12 +00:00
+								    pytest.param("''", id="1_argument"),
 								    pytest.param("'','','','','',''", id="6_arguments"),
-												AWS SDK integration - move s3 storage syntax tests to integration.

											
										
										
											2019-12-10 16:11:13 +00:00
+								])
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
+								def test_wrong_s3_syntax(started_cluster, s3_storage_args):
 								    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
-												AWS SDK integration - move s3 storage syntax tests to integration.

											
										
										
											2019-12-10 16:11:13 +00:00
+								    expected_err_msg = "Code: 42"  # NUMBER_OF_ARGUMENTS_DOESNT_MATCH
 								    query = "create table test_table_s3_syntax (id UInt32) ENGINE = S3({})".format(s3_storage_args)
 								    assert expected_err_msg in instance.query_and_get_error(query)
-												Added test for multi-page S3 globbing.

											
										
										
											2020-05-25 09:15:11 +00:00
-												Update test.py
											
										
										
											2020-05-25 21:05:15 +00:00
# https://en.wikipedia.org/wiki/One_Thousand_and_One_Nights
def test_s3_glob_scheherazade(started_cluster):
    """Upload 1001 single-row objects in parallel and count them with one glob query.

    Objects are written to ``night_<i>/tale.csv`` by worker threads, each
    covering a contiguous range of nights. A ``night_*/tale.csv`` select
    through the redirecting endpoint must then report 1001 rows.
    NOTE(review): the object count is presumably chosen to exceed one S3
    listing page - confirm.

    An exception raised inside a worker thread does not propagate to this
    function; a failed upload shows up as a wrong count in the final assertion.
    """
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 1, 1)"
    total_nights = 1001
    nights_per_job = total_nights // 30

    def add_tales(start, end):
        """Insert one object for every night in ``range(start, end)``."""
        for i in range(start, end):
            path = "night_{}/tale.csv".format(i)
            query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
                started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, path, table_format, values)
            run_query(instance, query)

    jobs = []
    for night in range(0, total_nights, nights_per_job):
        # The last range is clamped so no night past the total is written.
        job = threading.Thread(target=add_tales, args=(night, min(night + nights_per_job, total_nights)))
        job.start()
        jobs.append(job)

    for job in jobs:
        job.join()

    query = "select count(), sum(column1), sum(column2), sum(column3) from s3('http://{}:{}/{}/night_*/tale.csv', 'CSV', '{}')".format(
        started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, table_format)
    assert run_query(instance, query).splitlines() == ["1001\t1001\t1001\t1001"]
-												Add default credentials and custom headers for s3 table functions.

											
										
										
											2020-06-01 17:16:09 +00:00
-												Merge remote-tracking branch 'origin' into integration-2

											
										
										
											2021-04-27 15:34:33 +00:00
 								def run_s3_mocks(started_cluster):
-												Added tests.

											
										
										
											2021-04-12 08:55:54 +00:00
+								    logging.info("Starting s3 mocks")
 								    mocks = (
 								        ("mock_s3.py", "resolver", "8080"),
 								        ("unstable_server.py", "resolver", "8081"),
-												Fixed bug with S3 URLs containing `+` symbol, data with such keys could not be read previously.

											
										
										
											2021-05-04 06:25:33 +00:00
+								        ("echo.py", "resolver", "8082"),
-												Added tests.

											
										
										
											2021-04-12 08:55:54 +00:00
+								    )
 								    for mock_filename, container, port in mocks:
-												Merge remote-tracking branch 'origin' into integration-2

											
										
										
											2021-04-27 15:34:33 +00:00
+								        container_id = started_cluster.get_container_id(container)
-												Added tests.

											
										
										
											2021-04-12 08:55:54 +00:00
+								        current_dir = os.path.dirname(__file__)
-												Merge remote-tracking branch 'origin' into integration-2

											
										
										
											2021-04-27 15:34:33 +00:00
+								        started_cluster.copy_file_to_container(container_id, os.path.join(current_dir, "s3_mocks", mock_filename), mock_filename)
 								        started_cluster.exec_in_container(container_id, ["python", mock_filename, port], detach=True)
-												Added tests.

											
										
										
											2021-04-12 08:55:54 +00:00
 								    # Wait for S3 mocks to start
 								    for mock_filename, container, port in mocks:
-												Maybe Minio starts for too long in tests

											
										
										
											2021-06-06 09:38:49 +00:00
+								        num_attempts = 100
 								        for attempt in range(num_attempts):
-												Merge remote-tracking branch 'origin' into integration-2

											
										
										
											2021-04-27 15:34:33 +00:00
+								            ping_response = started_cluster.exec_in_container(started_cluster.get_container_id(container),
-												try

											
										
										
											2021-06-01 14:18:35 +00:00
+								                                                              ["curl", "-s", f"http://localhost:{port}/"], nothrow=True)
-												Added tests.

											
										
										
											2021-04-12 08:55:54 +00:00
+								            if ping_response != 'OK':
-												Maybe Minio starts for too long in tests

											
										
										
											2021-06-06 09:38:49 +00:00
+								                if attempt == num_attempts - 1:
-												Added tests.

											
										
										
											2021-04-12 08:55:54 +00:00
+								                    assert ping_response == 'OK', 'Expected "OK", but got "{}"'.format(ping_response)
 								                else:
 								                    time.sleep(1)
-												Fixed flaky test_storage_s3::test_custom_auth_headers

											
										
										
											2020-11-23 10:19:43 +00:00
+								            else:
-												s3 catch up

											
										
										
											2021-05-12 07:03:53 +00:00
+								                logging.debug(f"mock {mock_filename} ({port}) answered {ping_response} on attempt {attempt}")
-												Added tests.

											
										
										
											2021-04-12 08:55:54 +00:00
+								                break
-												Fixed flaky test_storage_s3::test_custom_auth_headers

											
										
										
											2020-11-23 10:19:43 +00:00
-												Added tests.

											
										
										
											2021-04-12 08:55:54 +00:00
+								    logging.info("S3 mocks started")
-												Add default credentials and custom headers for s3 table functions.

											
										
										
											2020-06-01 17:16:09 +00:00
-												recreate S3 client if credentials changed

											
										
										
											2021-03-04 15:56:55 +00:00
def replace_config(old, new):
    """Rewrite the file at ``CONFIG_PATH`` in place, replacing ``old`` with ``new``.

    Every occurrence of ``old`` on every line is replaced. The file is read
    completely before it is reopened for writing, and both handles are
    managed by ``with`` so they are closed even if reading or writing raises.
    """
    with open(CONFIG_PATH, 'r') as config:
        config_lines = config.readlines()

    config_lines = [line.replace(old, new) for line in config_lines]

    with open(CONFIG_PATH, 'w') as config:
        config.writelines(config_lines)
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
def test_custom_auth_headers(started_cluster):
    """Check reads through ``resolver:8080`` with the configured ``Authorization`` header.

    The file is read from the restricted bucket via the ``s3`` table function
    and via an ``S3`` engine table. The bearer token in the config file is
    then swapped for an invalid one and the config reloaded, after which the
    table read must return nothing and report an error. Finally the valid
    token is restored and the read must succeed again.
    """
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    get_query = "select * from s3('http://resolver:8080/{bucket}/{file}', 'CSV', '{table_format}')".format(
        bucket=started_cluster.minio_restricted_bucket,
        file=filename,
        table_format=table_format)

    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    result = run_query(instance, get_query)
    assert result == '1\t2\t3\n'

    instance.query("DROP TABLE IF EXISTS test")
    instance.query(
        "CREATE TABLE test ({table_format}) ENGINE = S3('http://resolver:8080/{bucket}/{file}', 'CSV')".format(
            bucket=started_cluster.minio_restricted_bucket,
            file=filename,
            table_format=table_format
        ))
    assert run_query(instance, "SELECT * FROM test") == '1\t2\t3\n'

    replace_config("<header>Authorization: Bearer TOKEN", "<header>Authorization: Bearer INVALID_TOKEN")
    try:
        instance.query("SYSTEM RELOAD CONFIG")
        ret, err = instance.query_and_get_answer_with_error("SELECT * FROM test")
        assert ret == "" and err != ""
    finally:
        # Always put the valid token back: the config file is shared, so a
        # failure above must not leave it broken for whatever runs next.
        replace_config("<header>Authorization: Bearer INVALID_TOKEN", "<header>Authorization: Bearer TOKEN")
        instance.query("SYSTEM RELOAD CONFIG")

    assert run_query(instance, "SELECT * FROM test") == '1\t2\t3\n'
    instance.query("DROP TABLE test")
-												recreate S3 client if credentials changed

											
										
										
											2021-03-04 15:56:55 +00:00
-												throw exception on redirect limit in S3 request

											
										
										
											2020-07-07 13:20:48 +00:00
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
def test_custom_auth_headers_exclusion(started_cluster):
    """Expect a read from ``restricteddirectory`` in the restricted bucket to be refused.

    The query must raise ``QueryRuntimeException`` with return code 243 and
    ``Forbidden Error`` in its stderr.
    """
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    filename = "test.csv"
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    get_query = f"SELECT * FROM s3('http://resolver:8080/{started_cluster.minio_restricted_bucket}/restricteddirectory/{filename}', 'CSV', '{table_format}')"

    with pytest.raises(helpers.client.QueryRuntimeException) as ei:
        # Only reached if the query unexpectedly succeeds; the output then
        # shows up in the failure report.
        print(run_query(instance, get_query))

    error = ei.value
    assert error.returncode == 243
    assert 'Forbidden Error' in error.stderr
-												Added prefix-based S3 endpoint settings.

											
										
										
											2021-01-07 03:42:39 +00:00
-												finally

											
										
										
											2021-04-29 11:57:48 +00:00
 								def test_infinite_redirect(started_cluster):
 								    bucket = "redirected"
 								    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
 								    filename = "test.csv"
 								    get_query = f"select * from s3('http://resolver:{started_cluster.minio_redirect_port}/{bucket}/{filename}', 'CSV', '{table_format}')"
 								    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
 								    exception_raised = False
 								    try:
 								        run_query(instance, get_query)
 								    except Exception as e:
 								        assert str(e).find("Too many redirects while trying to access") != -1
 								        exception_raised = True
 								    finally:
 								        assert exception_raised
-												Fixed table function S3 `auto` compression mode.

											
										
										
											2021-01-29 04:54:52 +00:00
@pytest.mark.parametrize("extension,method", [
    pytest.param("bin", "gzip", id="bin"),
    pytest.param("gz", "auto", id="gz"),
])
def test_storage_s3_get_gzip(started_cluster, extension, method):
    """Upload a gzip-compressed CSV and read it back through an ``S3`` engine table.

    ``extension`` selects the object's file extension and ``method`` the
    compression argument passed to the engine (explicit ``gzip`` for ``.bin``,
    ``auto`` for ``.gz``). The ``id`` column of the uploaded rows sums to 565.
    """
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    filename = f"test_get_gzip.{extension}"
    name = f"test_get_gzip_{extension}"
    data = [
        "Sophia Intrieri,55",
        "Jack Taylor,71",
        "Christopher Silva,66",
        "Clifton Purser,35",
        "Richard Aceuedo,43",
        "Lisa Hensley,31",
        "Alice Wehrley,1",
        "Mary Farmer,47",
        "Samara Ramirez,19",
        "Shirley Lloyd,51",
        "Santos Cowger,0",
        "Richard Mundt,88",
        "Jerry Gonzalez,15",
        "Angela James,10",
        "Norman Ortega,33",
        ""
    ]
    run_query(instance, f"DROP TABLE IF EXISTS {name}")

    # Compress in memory; the trailing "" entry in `data` yields a final newline.
    buf = io.BytesIO()
    with gzip.GzipFile(fileobj=buf, mode="wb") as compressed:
        compressed.write(("\n".join(data)).encode())

    put_s3_file_content(started_cluster, bucket, filename, buf.getvalue())

    run_query(instance, f"""CREATE TABLE {name} (name String, id UInt32) ENGINE = S3(
                                'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{filename}',
                                'CSV',
                                '{method}')""")

    # The original compared the result without `assert`, so nothing was checked.
    assert run_query(instance, f"SELECT sum(id) FROM {name}").splitlines() == ["565"]
    run_query(instance, f"DROP TABLE {name}")
-												Added test for GZIP in S3 storage.

											
										
										
											2020-09-28 23:30:41 +00:00
-												Merge remote-tracking branch 'origin' into integration-2

											
										
										
											2021-04-27 15:34:33 +00:00
def test_storage_s3_get_unstable(started_cluster):
    """Read ``test.csv`` through the endpoint on ``resolver:8081`` and check the aggregates.

    The query must return a single CSV line ``500001,500000,0`` for
    ``count()``, ``sum(column3)`` and ``sum(column4)``.
    """
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64"
    get_query = f"SELECT count(), sum(column3), sum(column4) FROM s3('http://resolver:8081/{started_cluster.minio_bucket}/test.csv', 'CSV', '{table_format}') FORMAT CSV"

    output_lines = run_query(instance, get_query).splitlines()
    assert output_lines == ["500001,500000,0"]
-												Added tests.

											
										
										
											2021-04-12 08:55:54 +00:00
-												better

											
										
										
											2021-02-20 14:59:39 +00:00
def test_storage_s3_put_uncompressed(started_cluster):
    """Insert rows through an uncompressed ``S3`` engine table and verify what was stored.

    The ``id`` values sum to 753. This is checked both by querying the table
    and by downloading the written object and summing its second CSV column.
    """
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    filename = "test_put_uncompressed.bin"
    name = "test_put_uncompressed"
    data = [
        "'Gloria Thompson',99",
        "'Matthew Tang',98",
        "'Patsy Anderson',23",
        "'Nancy Badillo',93",
        "'Roy Hunt',5",
        "'Adam Kirk',51",
        "'Joshua Douds',28",
        "'Jolene Ryan',0",
        "'Roxanne Padilla',50",
        "'Howard Roberts',41",
        "'Ricardo Broughton',13",
        "'Roland Speer',83",
        "'Cathy Cohan',58",
        "'Kathie Dawson',100",
        "'Gregg Mcquistion',11",
    ]
    run_query(instance, "CREATE TABLE {} (name String, id UInt32) ENGINE = S3('http://{}:{}/{}/{}', 'CSV')".format(
        name, started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename))

    # Joining with "),(" inside "VALUES (...)" produces one tuple per row.
    run_query(instance, "INSERT INTO {} VALUES ({})".format(name, "),(".join(data)))

    # The original compared the result without `assert`, so nothing was checked.
    assert run_query(instance, "SELECT sum(id) FROM {}".format(name)).splitlines() == ["753"]

    uncompressed_content = get_s3_file_content(started_cluster, bucket, filename)
    assert sum(int(line.split(',')[1]) for line in uncompressed_content.splitlines()) == 753
-												Somehow uncompressed PUT works and gzipped PUT doesn't, in S3 storage.

											
										
										
											2020-09-30 12:04:21 +00:00
-												Fixed table function S3 `auto` compression mode.

											
										
										
											2021-01-29 04:54:52 +00:00
@pytest.mark.parametrize("extension,method", [
    pytest.param("bin", "gzip", id="bin"),
    pytest.param("gz", "auto", id="gz")
])
def test_storage_s3_put_gzip(started_cluster, extension, method):
    """Insert rows through a gzip-compressed ``S3`` engine table and verify what was stored.

    ``extension`` selects the object's file extension and ``method`` the
    compression argument passed to the engine. The ``id`` values sum to 708.
    This is checked both by querying the table and by downloading the
    object, decompressing it with ``gzip`` and summing its second CSV column.
    """
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    filename = f"test_put_gzip.{extension}"
    name = f"test_put_gzip_{extension}"
    data = [
        "'Joseph Tomlinson',5",
        "'Earnest Essary',44",
        "'Matha Pannell',24",
        "'Michael Shavers',46",
        "'Elias Groce',38",
        "'Pamela Bramlet',50",
        "'Lewis Harrell',49",
        "'Tamara Fyall',58",
        "'George Dixon',38",
        "'Alice Walls',49",
        "'Paula Mais',24",
        "'Myrtle Pelt',93",
        "'Sylvia Naffziger',18",
        "'Amanda Cave',83",
        "'Yolanda Joseph',89"
    ]
    run_query(instance, f"""CREATE TABLE {name} (name String, id UInt32) ENGINE = S3(
                                'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{filename}',
                                'CSV',
                                '{method}')""")

    run_query(instance, f"INSERT INTO {name} VALUES ({'),('.join(data)})")

    # The original compared the result without `assert`, so nothing was checked.
    assert run_query(instance, f"SELECT sum(id) FROM {name}").splitlines() == ["708"]

    # Fetch the raw bytes (decode=False) and decompress them locally.
    buf = io.BytesIO(get_s3_file_content(started_cluster, bucket, filename, decode=False))
    with gzip.GzipFile(fileobj=buf, mode="rb") as f:
        uncompressed_content = f.read().decode()
    assert sum(int(line.split(',')[1]) for line in uncompressed_content.splitlines()) == 708
-												Truncate for s3

											
										
										
											2021-06-21 15:44:36 +00:00
 								def test_truncate_table(started_cluster):
 								    bucket = started_cluster.minio_bucket
 								    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
 								    name = "truncate"
 								    instance.query("CREATE TABLE {} (id UInt32) ENGINE = S3('http://{}:{}/{}/{}', 'CSV')".format(
 								        name, started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, name))
 								    instance.query("INSERT INTO {} SELECT number FROM numbers(10)".format(name))
 								    result = instance.query("SELECT * FROM {}".format(name))
 								    assert result == instance.query("SELECT number FROM numbers(10)")
 								    instance.query("TRUNCATE TABLE {}".format(name))
 								    minio = started_cluster.minio_client
 								    timeout = 30
 								    while timeout > 0:
 								        if len(list(minio.list_objects(started_cluster.minio_bucket, 'truncate/'))) == 0:
 								            return
 								        timeout -= 1
 								        time.sleep(1)
 								    assert(len(list(minio.list_objects(started_cluster.minio_bucket, 'truncate/'))) == 0)
 								    assert instance.query("SELECT * FROM {}".format(name)) == ""
-												Tests, url table function

											
										
										
											2021-09-08 19:28:22 +00:00
 								def test_predefined_connection_configuration(started_cluster):
 								    bucket = started_cluster.minio_bucket
 								    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
 								    name = "test_table"
 								    instance.query("drop table if exists {}".format(name))
 								    instance.query("CREATE TABLE {} (id UInt32) ENGINE = S3(s3_conf1, format='CSV')".format(name))
 								    instance.query("INSERT INTO {} SELECT number FROM numbers(10)".format(name))
 								    result = instance.query("SELECT * FROM {}".format(name))
 								    assert result == instance.query("SELECT number FROM numbers(10)")
 								    result = instance.query("SELECT * FROM s3(s3_conf1, format='CSV', structure='id UInt32')")
 								    assert result == instance.query("SELECT number FROM numbers(10)")
-												Reduce memory usage for some formats

											
										
										
											2021-10-31 19:53:24 +00:00
-												add test

											
										
										
											2021-11-09 20:11:02 +00:00
# Filled in by the worker thread of test_url_reconnect_in_the_middle.
result = ""


def test_url_reconnect_in_the_middle(started_cluster):
    """Read a large TSV through ``url()`` while the connection is being disturbed.

    One million rows are written to a randomly named object. While a
    background thread runs a slow aggregating ``url()`` query with HTTP retry
    settings, iptables-style rules from ``PartitionManager`` first reject a
    fraction of packets coming from the MinIO port with a TCP reset, then
    drop all of them for two seconds. The query must still produce the
    expected hash sum.
    """
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    table_format = "id String, data String"
    filename = "test_url_reconnect_{}.tsv".format(random.randint(0, 1000))

    instance.query(f"""insert into table function
                   s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'TSV', '{table_format}')
                   select number, randomPrintableASCII(number % 1000) from numbers(1000000)""")

    with PartitionManager() as pm:
        pm_rule_reject = {'probability': 0.02, 'destination': instance.ip_address, 'source_port': started_cluster.minio_port, 'action': 'REJECT --reject-with tcp-reset'}
        pm_rule_drop_all = {'destination': instance.ip_address, 'source_port': started_cluster.minio_port, 'action': 'DROP'}
        pm._add_rule(pm_rule_reject)

        def select():
            # Only store the answer here. It is verified in the main thread,
            # where a failed assertion actually fails the test.
            global result
            result = instance.query(
                f"""select sum(cityHash64(x)) from (select toUInt64(id) + sleep(0.1) as x from
                url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'TSV', '{table_format}')
                settings http_max_tries = 10, http_retry_max_backoff_ms=2000, http_send_timeout=1, http_receive_timeout=1)""")

        thread = threading.Thread(target=select)
        thread.start()
        time.sleep(4)
        pm._add_rule(pm_rule_drop_all)

        time.sleep(2)
        pm._delete_rule(pm_rule_drop_all)
        pm._delete_rule(pm_rule_reject)

        thread.join()

        # The original wrote `assert(int(result), N)`, which asserts a
        # non-empty tuple and therefore always passes.
        assert int(result) == 3914219105369203805
-												Merge branch 'master' of github.com:ClickHouse/ClickHouse into seekable-read-buffers

											
										
										
											2021-11-13 11:38:57 +00:00
-												Reduce memory usage for some formats

											
										
										
											2021-10-31 19:53:24 +00:00
def test_seekable_formats(started_cluster):
    """Write and count 5,000,000 rows in Parquet and ORC via ``s3``, then check logged memory usage.

    After both round trips the query log is flushed and the most recent
    ``SELECT count() FROM s3...`` entry with non-zero memory usage is
    inspected.
    """
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance

    # Parquet round trip.
    table_function = "s3(s3_parquet, structure='a Int32, b String', format='Parquet')"
    instance.query(f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)")
    parquet_count = instance.query(f"SELECT count() FROM {table_function}")
    assert int(parquet_count) == 5000000

    # ORC round trip (the insert goes through the retrying helper).
    table_function = "s3(s3_orc, structure='a Int32, b String', format='ORC')"
    exec_query_with_retry(instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)")
    orc_count = instance.query(f"SELECT count() FROM {table_function}")
    assert int(orc_count) == 5000000

    instance.query("SYSTEM FLUSH LOGS")
    result = instance.query(f"SELECT formatReadableSize(memory_usage) FROM system.query_log WHERE startsWith(query, 'SELECT count() FROM s3') AND memory_usage > 0 ORDER BY event_time desc")
    # NOTE(review): only the first three characters of the formatted size are
    # parsed and the unit suffix is ignored; a value whose third character is
    # "." would make int() raise. Confirm this parsing is intended.
    size_prefix = result[:3]
    print(size_prefix)
    assert int(size_prefix) < 200
 								def test_seekable_formats_url(started_cluster):
 								    """Write Parquet and ORC data via s3() and read it back via url().
 								
 								    For each format, 5,000,000 rows are inserted through the ``s3`` table
 								    function and then counted through the ``url`` table function pointed at
 								    the MinIO endpoint. Afterwards the memory usage logged in
 								    ``system.query_log`` for the most recent ``SELECT count() FROM url...``
 								    query is checked against a 200 MiB ceiling.
 								    """
 								    bucket = started_cluster.minio_bucket
 								    instance = started_cluster.instances["dummy"]
 								    minio_root = f"http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}"
 								    insert_source = "select number, randomString(100) from numbers(5000000)"
 								    # Parquet: insert through s3(), count through url().
 								    # NOTE(review): s3_parquet presumably resolves to <bucket>/test_parquet - confirm in the instance config.
 								    parquet_sink = "s3(s3_parquet, structure='a Int32, b String', format='Parquet')"
 								    instance.query(f"insert into table function {parquet_sink} {insert_source}")
 								    parquet_url = f"url('{minio_root}/test_parquet', 'Parquet', 'a Int32, b String')"
 								    result = instance.query(f"SELECT count() FROM {parquet_url}")
 								    assert int(result) == 5000000
 								    # ORC: same round trip, but the insert goes through exec_query_with_retry.
 								    orc_sink = "s3(s3_orc, structure='a Int32, b String', format='ORC')"
 								    exec_query_with_retry(instance, f"insert into table function {orc_sink} {insert_source}")
 								    orc_url = f"url('{minio_root}/test_orc', 'ORC', 'a Int32, b String')"
 								    result = instance.query(f"SELECT count() FROM {orc_url}")
 								    assert int(result) == 5000000
 								    # Flush logs so the count queries above are visible in system.query_log.
 								    instance.query("SYSTEM FLUSH LOGS")
 								    result = instance.query("SELECT formatReadableSize(memory_usage) FROM system.query_log WHERE startsWith(query, 'SELECT count() FROM url') AND memory_usage > 0 ORDER BY event_time desc")
 								    # The query is ordered by event_time desc, so the first row is the newest entry.
 								    rows = result.splitlines()
 								    assert rows, "no 'SELECT count() FROM url' entry with memory_usage > 0 in system.query_log"
 								    print(rows[0])
 								    # formatReadableSize yields "<number> <unit>" (e.g. "192.85 MiB"). Slicing the
 								    # first three characters breaks for values below 100 ("95." is not an int) and
 								    # ignores the unit, so parse both parts and normalise to MiB instead.
 								    value, unit = rows[0].split()
 								    units = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB")
 								    memory_mib = float(value) * 1024.0 ** (units.index(unit) - units.index("MiB"))
 								    # NOTE(review): the 200 limit is presumably meant in MiB - confirm.
 								    assert memory_mib < 200
-												Fix

											
										
										
											2021-12-22 08:42:23 +00:00
 								def test_empty_file(started_cluster):
 								    """Counting the rows of a zero-byte S3 object read as CSV must give 0."""
 								    target_bucket = started_cluster.minio_bucket
 								    node = started_cluster.instances["dummy"]
 								    object_name = "empty"
 								    object_url = f'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{target_bucket}/{object_name}'
 								    # Upload an object with no content (declared length 0) before querying it.
 								    started_cluster.minio_client.put_object(target_bucket, object_name, io.BytesIO(b""), 0)
 								    row_count = node.query(f"SELECT count() FROM s3('{object_url}', 'CSV', 'id Int32')")
 								    assert int(row_count) == 0
-												Don't allow to write into S3 if path contains globs

											
										
										
											2021-12-24 14:13:35 +00:00
 								def test_insert_with_path_with_globs(started_cluster):
 								    """Attempt an INSERT through s3() whose path contains a glob ('*')."""
 								    instance = started_cluster.instances["dummy"]
 								    table_function_3 = f"s3('http://minio1:9001/root/test_parquet*', 'minio', 'minio123', 'Parquet', 'a Int32, b String')"
 								    # The returned error text is not inspected here.
 								    # NOTE(review): query_and_get_error presumably fails the test when the query succeeds - confirm in helpers.cluster.
 								    instance.query_and_get_error(f"insert into table function {table_function_3} SELECT number, randomString(100) FROM numbers(500)")