import gzip
import io
import logging
import os
import random
import threading
import time
import uuid

import helpers.client
import pytest
from helpers.cluster import ClickHouseCluster, ClickHouseInstance
from helpers.network import PartitionManager
from helpers.mock_servers import start_mock_servers
from helpers.test_tools import exec_query_with_retry
from helpers.s3_tools import prepare_s3_bucket

MINIO_INTERNAL_PORT = 9001

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))


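# Uploads raw bytes to MinIO as bucket/filename.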
def put_s3_file_content(started_cluster, bucket, filename, data):
    buf = io.BytesIO(data)
    started_cluster.minio_client.put_object(bucket, filename, buf, len(data))


# Returns content of given S3 file as a string (or raw bytes if decode=False).
def get_s3_file_content(started_cluster, bucket, filename, decode=True):
    # type: (ClickHouseCluster, str, str, bool) -> str

    data = started_cluster.minio_client.get_object(bucket, filename)
    data_str = b""
    for chunk in data.stream():
        data_str += chunk
    if decode:
        return data_str.decode()
    return data_str


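# Module-scoped cluster: several ClickHouse instances with different configs, all
# backed by MinIO; prepares the test bucket and starts the S3 mock servers.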
@pytest.fixture(scope="module")
def started_cluster():
    try:
        cluster = ClickHouseCluster(__file__)
        cluster.add_instance(
            "restricted_dummy",
            main_configs=["configs/config_for_test_remote_host_filter.xml"],
            with_minio=True,
        )
        cluster.add_instance(
            "dummy",
            with_minio=True,
            main_configs=[
                "configs/defaultS3.xml",
                "configs/named_collections.xml",
                "configs/schema_cache.xml",
                "configs/blob_log.xml",
            ],
            user_configs=[
                "configs/access.xml",
                "configs/users.xml",
                "configs/s3_retry.xml",
            ],
        )
        cluster.add_instance(
            "dummy_without_named_collections",
            with_minio=True,
            main_configs=[
                "configs/defaultS3.xml",
                "configs/named_collections.xml",
                "configs/schema_cache.xml",
            ],
            user_configs=["configs/access.xml"],
        )
        cluster.add_instance(
            "s3_max_redirects",
            with_minio=True,
            main_configs=["configs/defaultS3.xml"],
            user_configs=["configs/s3_max_redirects.xml", "configs/s3_retry.xml"],
        )
        cluster.add_instance(
            "s3_non_default",
            with_minio=True,
        )
        cluster.add_instance(
            "s3_with_environment_credentials",
            with_minio=True,
            env_variables={
                "AWS_ACCESS_KEY_ID": "minio",
                "AWS_SECRET_ACCESS_KEY": "minio123",
            },
            main_configs=["configs/use_environment_credentials.xml"],
        )

        logging.info("Starting cluster...")
        cluster.start()
        logging.info("Cluster started")

        prepare_s3_bucket(cluster)
        logging.info("S3 bucket created")
        run_s3_mocks(cluster)

        yield cluster
    finally:
        cluster.shutdown()


def run_query(instance, query, *args, **kwargs):
    logging.info("Running query '{}'...".format(query))
    result = instance.query(query, *args, **kwargs)
    logging.info("Query finished")

    return result


# Test simple put. Also checks that wrong credentials produce an error with every compression method.
@pytest.mark.parametrize(
    "maybe_auth,positive,compression",
    [
        pytest.param("", True, "auto", id="positive"),
        pytest.param("'minio','minio123',", True, "auto", id="auth_positive"),
        pytest.param("'wrongid','wrongkey',", False, "auto", id="auto"),
        pytest.param("'wrongid','wrongkey',", False, "gzip", id="gzip"),
        pytest.param("'wrongid','wrongkey',", False, "deflate", id="deflate"),
        pytest.param("'wrongid','wrongkey',", False, "brotli", id="brotli"),
        pytest.param("'wrongid','wrongkey',", False, "xz", id="xz"),
        pytest.param("'wrongid','wrongkey',", False, "zstd", id="zstd"),
    ],
)
def test_put(started_cluster, maybe_auth, positive, compression):
    # type: (ClickHouseCluster, str, bool, str) -> None

    bucket = (
        started_cluster.minio_bucket
        if not maybe_auth
        else started_cluster.minio_restricted_bucket
    )
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
    values_csv = "1,2,3\n3,2,1\n78,43,45\n"
    filename = "test.csv"
    put_query = f"""insert into table function s3('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/{filename}',
                    {maybe_auth}'CSV', '{table_format}', '{compression}') settings s3_truncate_on_insert=1 values {values}"""

    try:
        run_query(instance, put_query)
    except helpers.client.QueryRuntimeException:
        if positive:
            raise
    else:
        assert positive
        assert values_csv == get_s3_file_content(started_cluster, bucket, filename)


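# PARTITION BY on INSERT: each partition key value is written to its own file,
# substituted for {_partition_id} in the file name.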
def test_partition_by(started_cluster):
    id = uuid.uuid4()
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    partition_by = "column3"
    values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
    filename = "test_{_partition_id}.csv"
    put_query = f"""INSERT INTO TABLE FUNCTION
        s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{filename}', 'CSV', '{table_format}')
        PARTITION BY {partition_by} VALUES {values}"""

    run_query(instance, put_query)

    assert "1,2,3\n" == get_s3_file_content(started_cluster, bucket, f"{id}/test_3.csv")
    assert "3,2,1\n" == get_s3_file_content(started_cluster, bucket, f"{id}/test_1.csv")
    assert "78,43,45\n" == get_s3_file_content(
        started_cluster, bucket, f"{id}/test_45.csv"
    )

    filename = "test2_{_partition_id}.csv"
    instance.query(
        f"create table p ({table_format}) engine=S3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{filename}', 'CSV') partition by column3"
    )
    instance.query(f"insert into p values {values}")
    assert "1,2,3\n" == get_s3_file_content(
        started_cluster, bucket, f"{id}/test2_3.csv"
    )
    assert "3,2,1\n" == get_s3_file_content(
        started_cluster, bucket, f"{id}/test2_1.csv"
    )
    assert "78,43,45\n" == get_s3_file_content(
        started_cluster, bucket, f"{id}/test2_45.csv"
    )
    instance.query("drop table p")


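# PARTITION BY a String column: the value is substituted into the file name verbatim
# (including '/', which creates a nested key).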
def test_partition_by_string_column(started_cluster):
    id = uuid.uuid4()
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "col_num UInt32, col_str String"
    partition_by = "col_str"
    values = "(1, 'foo/bar'), (3, 'йцук'), (78, '你好')"
    filename = "test_{_partition_id}.csv"
    put_query = f"""INSERT INTO TABLE FUNCTION
        s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{filename}', 'CSV', '{table_format}')
        PARTITION BY {partition_by} VALUES {values}"""
    run_query(instance, put_query)

    assert '1,"foo/bar"\n' == get_s3_file_content(
        started_cluster, bucket, f"{id}/test_foo/bar.csv"
    )
    assert '3,"йцук"\n' == get_s3_file_content(
        started_cluster, bucket, f"{id}/test_йцук.csv"
    )
    assert '78,"你好"\n' == get_s3_file_content(
        started_cluster, bucket, f"{id}/test_你好.csv"
    )


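# PARTITION BY a constant expression: all rows end up in a single file.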
def test_partition_by_const_column(started_cluster):
    id = uuid.uuid4()
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
    partition_by = "'88'"
    values_csv = "1,2,3\n3,2,1\n78,43,45\n"
    filename = "test_{_partition_id}.csv"
    put_query = f"""INSERT INTO TABLE FUNCTION
        s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{filename}', 'CSV', '{table_format}')
        PARTITION BY {partition_by} VALUES {values}"""
    run_query(instance, put_query)

    assert values_csv == get_s3_file_content(
        started_cluster, bucket, f"{id}/test_88.csv"
    )


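# Reading files whose names contain characters that need URL encoding (space, plus).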
@pytest.mark.parametrize("special", ["space", "plus"])
def test_get_file_with_special(started_cluster, special):
    symbol = {"space": " ", "plus": "+"}[special]
    urlsafe_symbol = {"space": "%20", "plus": "%2B"}[special]
    auth = "'minio','minio123',"
    bucket = started_cluster.minio_restricted_bucket
    instance = started_cluster.instances["dummy"]
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = [
        [12549, 2463, 19893],
        [64021, 38652, 66703],
        [81611, 39650, 83516],
        [11079, 59507, 61546],
        [51764, 69952, 6876],
        [41165, 90293, 29095],
        [40167, 78432, 48309],
        [81629, 81327, 11855],
        [55852, 21643, 98507],
        [6738, 54643, 41155],
    ]
    values_csv = (
        "\n".join((",".join(map(str, row)) for row in values)) + "\n"
    ).encode()
    filename = f"get_file_with_{special}_{symbol}two.csv"
    put_s3_file_content(started_cluster, bucket, filename, values_csv)

    get_query = f"SELECT * FROM s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/get_file_with_{special}_{urlsafe_symbol}two.csv', {auth}'CSV', '{table_format}') FORMAT TSV"
    assert [
        list(map(int, l.split())) for l in run_query(instance, get_query).splitlines()
    ] == values

    get_query = f"SELECT * FROM s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/get_file_with_{special}*.csv', {auth}'CSV', '{table_format}') FORMAT TSV"
    assert [
        list(map(int, l.split())) for l in run_query(instance, get_query).splitlines()
    ] == values

    get_query = f"SELECT * FROM s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/get_file_with_{special}_{urlsafe_symbol}*.csv', {auth}'CSV', '{table_format}') FORMAT TSV"
    assert [
        list(map(int, l.split())) for l in run_query(instance, get_query).splitlines()
    ] == values


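# Reading URL-encoded paths through the echo mock (resolver:8082).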
@pytest.mark.parametrize("special", ["space", "plus", "plus2"])
def test_get_path_with_special(started_cluster, special):
    symbol = {"space": "%20", "plus": "%2B", "plus2": "%2B"}[special]
    safe_symbol = {"space": "%20", "plus": "+", "plus2": "%2B"}[special]
    auth = "'minio','minio123',"
    table_format = "column1 String"
    instance = started_cluster.instances["dummy"]
    get_query = f"SELECT * FROM s3('http://resolver:8082/get-my-path/{safe_symbol}.csv', {auth}'CSV', '{table_format}') FORMAT TSV"
    assert run_query(instance, get_query).splitlines() == [f"/{symbol}.csv"]


# Test putting no data to S3 (insert from an empty table).
@pytest.mark.parametrize("auth", [pytest.param("'minio','minio123',", id="minio")])
def test_empty_put(started_cluster, auth):
    # type: (ClickHouseCluster, str) -> None

    id = uuid.uuid4()
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"

    drop_empty_table_query = "DROP TABLE IF EXISTS empty_table"
    create_empty_table_query = (
        f"CREATE TABLE empty_table ({table_format}) ENGINE = Null()"
    )

    run_query(instance, drop_empty_table_query)
    run_query(instance, create_empty_table_query)

    filename = "empty_put_test.csv"
    put_query = f"""insert into table function
        s3('http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{id}/{filename}', {auth}'CSV', '{table_format}')
        select * from empty_table"""

    run_query(instance, put_query)

    assert (
        run_query(
            instance,
            f"""select count(*) from
            s3('http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{id}/{filename}', {auth}'CSV', '{table_format}')""",
        )
        == "0\n"
    )


# Test put values in CSV format.
@pytest.mark.parametrize(
    "maybe_auth,positive",
    [
        pytest.param("", True, id="positive"),
        pytest.param("'minio','minio123',", True, id="auth_positive"),
        pytest.param("'wrongid','wrongkey',", False, id="negative"),
    ],
)
def test_put_csv(started_cluster, maybe_auth, positive):
    # type: (ClickHouseCluster, str, bool) -> None

    bucket = (
        started_cluster.minio_bucket
        if not maybe_auth
        else started_cluster.minio_restricted_bucket
    )
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') settings s3_truncate_on_insert=1 format CSV".format(
        started_cluster.minio_ip,
        MINIO_INTERNAL_PORT,
        bucket,
        filename,
        maybe_auth,
        table_format,
    )
    csv_data = "8,9,16\n11,18,13\n22,14,2\n"

    try:
        run_query(instance, put_query, stdin=csv_data)
    except helpers.client.QueryRuntimeException:
        if positive:
            raise
    else:
        assert positive
        assert csv_data == get_s3_file_content(started_cluster, bucket, filename)


# Test put and get with S3 server redirect.
def test_put_get_with_redirect(started_cluster):
    # type: (ClickHouseCluster) -> None

    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
    values_csv = "1,1,1\n1,1,1\n11,11,11\n"
    filename = "test.csv"
    query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') settings s3_truncate_on_insert=1 values {}".format(
        started_cluster.minio_redirect_host,
        started_cluster.minio_redirect_port,
        bucket,
        filename,
        table_format,
        values,
    )
    run_query(instance, query)

    assert values_csv == get_s3_file_content(started_cluster, bucket, filename)

    query = "select *, column1*column2*column3 from s3('http://{}:{}/{}/{}', 'CSV', '{}')".format(
        started_cluster.minio_redirect_host,
        started_cluster.minio_redirect_port,
        bucket,
        filename,
        table_format,
    )
    stdout = run_query(instance, query)

    assert list(map(str.split, stdout.splitlines())) == [
        ["1", "1", "1", "1"],
        ["1", "1", "1", "1"],
        ["11", "11", "11", "1331"],
    ]


# Test put with restricted S3 server redirect.
def test_put_with_zero_redirect(started_cluster):
    # type: (ClickHouseCluster) -> None

    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["s3_max_redirects"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
    filename = "test.csv"

    # Should work without redirect
    query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') settings s3_truncate_on_insert=1 values {}".format(
        started_cluster.minio_ip,
        MINIO_INTERNAL_PORT,
        bucket,
        filename,
        table_format,
        values,
    )
    run_query(instance, query)

    # Should not work with redirect
    query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') settings s3_truncate_on_insert=1 values {}".format(
        started_cluster.minio_redirect_host,
        started_cluster.minio_redirect_port,
        bucket,
        filename,
        table_format,
        values,
    )
    exception_raised = False
    try:
        run_query(instance, query)
    except Exception as e:
        assert str(e).find("Too many redirects while trying to access") != -1
        exception_raised = True
    finally:
        assert exception_raised


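# Writes a 10x10 grid of files and reads them back with globs; also checks
# the _file and _path virtual columns.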
def test_put_get_with_globs(started_cluster):
    # type: (ClickHouseCluster) -> None

    unique_prefix = random.randint(1, 10000)
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    max_path = ""
    for i in range(10):
        for j in range(10):
            path = "{}/{}_{}/{}.csv".format(
                unique_prefix, i, random.choice(["a", "b", "c", "d"]), j
            )
            max_path = max(path, max_path)
            values = "({},{},{})".format(i, j, i + j)
            query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
                started_cluster.minio_ip,
                MINIO_INTERNAL_PORT,
                bucket,
                path,
                table_format,
                values,
            )
            run_query(instance, query)

    query = "select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from s3('http://{}:{}/{}/{}/*_{{a,b,c,d}}/%3f.csv', 'CSV', '{}')".format(
        started_cluster.minio_redirect_host,
        started_cluster.minio_redirect_port,
        bucket,
        unique_prefix,
        table_format,
    )
    assert run_query(instance, query).splitlines() == [
        "450\t450\t900\t0.csv\t{bucket}/{max_path}".format(
            bucket=bucket, max_path=max_path
        )
    ]

    minio = started_cluster.minio_client
    for obj in list(
        minio.list_objects(
            started_cluster.minio_bucket,
            prefix="{}/".format(unique_prefix),
            recursive=True,
        )
    ):
        minio.remove_object(started_cluster.minio_bucket, obj.object_name)


# Test multipart put.
@pytest.mark.parametrize(
    "maybe_auth,positive",
    [
        pytest.param("", True, id="positive"),
        pytest.param("'wrongid','wrongkey'", False, id="negative"),
        # ("'minio','minio123',",True), Redirect with credentials not working with nginx.
    ],
)
def test_multipart(started_cluster, maybe_auth, positive):
    # type: (ClickHouseCluster, str, bool) -> None

    id = uuid.uuid4()
    bucket = (
        started_cluster.minio_bucket
        if not maybe_auth
        else started_cluster.minio_restricted_bucket
    )
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"

    # Minimum size of part is 5 Mb for Minio.
    # See: https://github.com/minio/minio/blob/master/docs/minio-limits.md
    min_part_size_bytes = 5 * 1024 * 1024
    csv_size_bytes = int(min_part_size_bytes * 1.5)  # To have 2 parts.

    one_line_length = 6  # 3 digits, 2 commas, 1 line separator.

    total_rows = csv_size_bytes // one_line_length
    # Generate data having size more than one part
    int_data = [[1, 2, 3] for i in range(total_rows)]
    csv_data = "".join(["{},{},{}\n".format(x, y, z) for x, y, z in int_data])

    assert len(csv_data) > min_part_size_bytes

    filename = f"{id}/test_multipart.csv"
    put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format(
        started_cluster.minio_redirect_host,
        started_cluster.minio_redirect_port,
        bucket,
        filename,
        maybe_auth,
        table_format,
    )

    put_query_id = uuid.uuid4().hex
    try:
        run_query(
            instance,
            put_query,
            stdin=csv_data,
            settings={
                "s3_min_upload_part_size": min_part_size_bytes,
                "s3_max_single_part_upload_size": 0,
            },
            query_id=put_query_id,
        )
    except helpers.client.QueryRuntimeException:
        if positive:
            raise
    else:
        assert positive

        # Use proxy access logs to count number of parts uploaded to Minio.
        proxy_logs = started_cluster.get_container_logs("proxy1")  # type: str
        assert proxy_logs.count("PUT /{}/{}".format(bucket, filename)) >= 2

        assert csv_data == get_s3_file_content(started_cluster, bucket, filename)

    # select uploaded data from many threads
    select_query = (
        "select sum(column1), sum(column2), sum(column3) "
        "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format(
            host=started_cluster.minio_redirect_host,
            port=started_cluster.minio_redirect_port,
            bucket=bucket,
            filename=filename,
            auth=maybe_auth,
            table_format=table_format,
        )
    )
    try:
        select_result = run_query(
            instance,
            select_query,
            settings={
                "max_download_threads": random.randint(4, 16),
                "max_download_buffer_size": 1024 * 1024,
            },
        )
    except helpers.client.QueryRuntimeException:
        if positive:
            raise
    else:
        assert positive
        assert (
            select_result
            == "\t".join(map(str, [total_rows, total_rows * 2, total_rows * 3])) + "\n"
        )

    if positive:
        instance.query("SYSTEM FLUSH LOGS")
        blob_storage_log = instance.query(f"SELECT * FROM system.blob_storage_log")

        result = instance.query(
            f"""SELECT
                countIf(event_type == 'MultiPartUploadCreate'),
                countIf(event_type == 'MultiPartUploadWrite'),
                countIf(event_type == 'MultiPartUploadComplete'),
                count()
            FROM system.blob_storage_log WHERE query_id = '{put_query_id}'"""
        )
        r = result.strip().split("\t")
        assert int(r[0]) == 1, blob_storage_log
        assert int(r[1]) >= 1, blob_storage_log
        assert int(r[2]) == 1, blob_storage_log
        assert int(r[0]) + int(r[1]) + int(r[2]) == int(r[3]), blob_storage_log


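# Hosts not allowed by the remote host filter config must be rejected.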
def test_remote_host_filter(started_cluster):
    instance = started_cluster.instances["restricted_dummy"]
    format = "column1 UInt32, column2 UInt32, column3 UInt32"

    query = "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(
        "invalid_host", MINIO_INTERNAL_PORT, started_cluster.minio_bucket, format
    )
    assert "not allowed in configuration file" in instance.query_and_get_error(query)

    other_values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)"
    query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(
        "invalid_host",
        MINIO_INTERNAL_PORT,
        started_cluster.minio_bucket,
        format,
        other_values,
    )
    assert "not allowed in configuration file" in instance.query_and_get_error(query)


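# Wrong numbers of S3() engine arguments must produce argument-validation errors.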
def test_wrong_s3_syntax(started_cluster):
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance

    expected_err_msg = "Code: 42"  # NUMBER_OF_ARGUMENTS_DOESNT_MATCH
    query = "create table test_table_s3_syntax (id UInt32) ENGINE = S3('', '', '', '', '', '', '')"
    assert expected_err_msg in instance.query_and_get_error(query)

    expected_err_msg = "Code: 36"  # BAD_ARGUMENTS
    query = "create table test_table_s3_syntax (id UInt32) ENGINE = S3('')"
    assert expected_err_msg in instance.query_and_get_error(query)


# https://en.wikipedia.org/wiki/One_Thousand_and_One_Nights
def test_s3_glob_scheherazade(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 1, 1)"
    nights_per_job = 1001 // 30
    jobs = []
    for night in range(0, 1001, nights_per_job):

        def add_tales(start, end):
            for i in range(start, end):
                path = "night_{}/tale.csv".format(i)
                query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
                    started_cluster.minio_ip,
                    MINIO_INTERNAL_PORT,
                    bucket,
                    path,
                    table_format,
                    values,
                )
                run_query(instance, query)

        jobs.append(
            threading.Thread(
                target=add_tales, args=(night, min(night + nights_per_job, 1001))
            )
        )
        jobs[-1].start()

    for job in jobs:
        job.join()

    query = "select count(), sum(column1), sum(column2), sum(column3) from s3('http://{}:{}/{}/night_*/tale.csv', 'CSV', '{}')".format(
        started_cluster.minio_redirect_host,
        started_cluster.minio_redirect_port,
        bucket,
        table_format,
    )
    assert run_query(instance, query).splitlines() == ["1001\t1001\t1001\t1001"]


# A bit simplified version of the scheherazade test:
# checks e.g. `prefix{1,2}/file*.csv`, where there are more than 1000 files under prefix1.
def test_s3_glob_many_objects_under_selection(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 1, 1)"
    jobs = []
    for thread_num in range(16):

        def create_files(thread_num):
            for f_num in range(thread_num * 63, thread_num * 63 + 63):
                path = f"folder1/file{f_num}.csv"
                query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') settings s3_truncate_on_insert=1 values {}".format(
                    started_cluster.minio_ip,
                    MINIO_INTERNAL_PORT,
                    bucket,
                    path,
                    table_format,
                    values,
                )
                run_query(instance, query)

        jobs.append(threading.Thread(target=create_files, args=(thread_num,)))
        jobs[-1].start()

    query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') settings s3_truncate_on_insert=1 values {}".format(
        started_cluster.minio_ip,
        MINIO_INTERNAL_PORT,
        bucket,
        f"folder2/file0.csv",
        table_format,
        values,
    )
    run_query(instance, query)

    for job in jobs:
        job.join()

    query = "select count(), sum(column1), sum(column2), sum(column3) from s3('http://{}:{}/{}/folder{{1,2}}/file*.csv', 'CSV', '{}')".format(
        started_cluster.minio_redirect_host,
        started_cluster.minio_redirect_port,
        bucket,
        table_format,
    )
    assert run_query(instance, query).splitlines() == ["1009\t1009\t1009\t1009"]


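# Starts the auxiliary HTTP mock servers (from s3_mocks/) that tests address as 'resolver'.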
def run_s3_mocks(started_cluster):
    script_dir = os.path.join(os.path.dirname(__file__), "s3_mocks")
    start_mock_servers(
        started_cluster,
        script_dir,
        [
            ("mock_s3.py", "resolver", "8080"),
            ("unstable_server.py", "resolver", "8081"),
            ("echo.py", "resolver", "8082"),
            ("no_list_objects.py", "resolver", "8083"),
        ],
    )


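# Plain string replacement in a config file; used to flip the auth header between reloads.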
def replace_config(path, old, new):
    config = open(path, "r")
    config_lines = config.readlines()
    config.close()
    config_lines = [line.replace(old, new) for line in config_lines]
    config = open(path, "w")
    config.writelines(config_lines)
    config.close()


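# Custom auth headers from defaultS3.xml must be applied, and SYSTEM RELOAD CONFIG
# must pick up changes to them.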
def test_custom_auth_headers(started_cluster):
    config_path = os.path.join(
        SCRIPT_DIR,
        "./{}/dummy/configs/config.d/defaultS3.xml".format(
            started_cluster.instances_dir_name
        ),
    )

    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    get_query = "select * from s3('http://resolver:8080/{bucket}/{file}', 'CSV', '{table_format}')".format(
        bucket=started_cluster.minio_restricted_bucket,
        file=filename,
        table_format=table_format,
    )
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    result = run_query(instance, get_query)
    assert result == "1\t2\t3\n"

    instance.query("DROP TABLE IF EXISTS test")
    instance.query(
        "CREATE TABLE test ({table_format}) ENGINE = S3('http://resolver:8080/{bucket}/{file}', 'CSV')".format(
            bucket=started_cluster.minio_restricted_bucket,
            file=filename,
            table_format=table_format,
        )
    )
    assert run_query(instance, "SELECT * FROM test") == "1\t2\t3\n"

    replace_config(
        config_path,
        "<header>Authorization: Bearer TOKEN",
        "<header>Authorization: Bearer INVALID_TOKEN",
    )
    instance.query("SYSTEM RELOAD CONFIG")
    ret, err = instance.query_and_get_answer_with_error("SELECT * FROM test")
    assert ret == "" and err != ""
    replace_config(
        config_path,
        "<header>Authorization: Bearer INVALID_TOKEN",
        "<header>Authorization: Bearer TOKEN",
    )
    instance.query("SYSTEM RELOAD CONFIG")
    assert run_query(instance, "SELECT * FROM test") == "1\t2\t3\n"
    instance.query("DROP TABLE test")


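# Configured auth headers must not be sent to excluded paths; the mock returns 403
# for the restricted directory.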
def test_custom_auth_headers_exclusion(started_cluster):
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    get_query = f"SELECT * FROM s3('http://resolver:8080/{started_cluster.minio_restricted_bucket}/restricteddirectory/{filename}', 'CSV', '{table_format}')"

    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    with pytest.raises(helpers.client.QueryRuntimeException) as ei:
        result = run_query(instance, get_query)
        print(result)

    assert ei.value.returncode == 243
    assert "HTTP response code: 403" in ei.value.stderr


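# An endless redirect loop must fail with 'Too many redirects'.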
def test_infinite_redirect(started_cluster):
    bucket = "redirected"
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    filename = "test.csv"
    get_query = f"select * from s3('http://resolver:{started_cluster.minio_redirect_port}/{bucket}/{filename}', 'CSV', '{table_format}')"
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    exception_raised = False
    try:
        run_query(instance, get_query)
    except Exception as e:
        assert str(e).find("Too many redirects while trying to access") != -1
        exception_raised = True
    finally:
        assert exception_raised


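# Reading a gzip-compressed file, with explicit 'gzip' and with 'auto' detection by extension.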
@pytest.mark.parametrize(
    "extension,method",
    [pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz")],
)
def test_storage_s3_get_gzip(started_cluster, extension, method):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    filename = f"test_get_gzip.{extension}"
    name = f"test_get_gzip_{extension}"
    data = [
        "Sophia Intrieri,55",
        "Jack Taylor,71",
        "Christopher Silva,66",
        "Clifton Purser,35",
        "Richard Aceuedo,43",
        "Lisa Hensley,31",
        "Alice Wehrley,1",
        "Mary Farmer,47",
        "Samara Ramirez,19",
        "Shirley Lloyd,51",
        "Santos Cowger,0",
        "Richard Mundt,88",
        "Jerry Gonzalez,15",
        "Angela James,10",
        "Norman Ortega,33",
        "",
    ]
    run_query(instance, f"DROP TABLE IF EXISTS {name}")

    buf = io.BytesIO()
    compressed = gzip.GzipFile(fileobj=buf, mode="wb")
    compressed.write(("\n".join(data)).encode())
    compressed.close()
    put_s3_file_content(started_cluster, bucket, filename, buf.getvalue())

    run_query(
        instance,
        f"""CREATE TABLE {name} (name String, id UInt32) ENGINE = S3(
                    'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{filename}',
                    'CSV',
                    '{method}')""",
    )

    assert run_query(instance, f"SELECT sum(id) FROM {name}").splitlines() == ["565"]
    run_query(instance, f"DROP TABLE {name}")


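# Reading through the unstable_server mock (resolver:8081), which interrupts transfers;
# s3_max_single_read_retries allows the read to recover.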
def test_storage_s3_get_unstable(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64"
    get_query = f"SELECT count(), sum(column3), sum(column4) FROM s3('http://resolver:8081/{started_cluster.minio_bucket}/test.csv', 'CSV', '{table_format}') SETTINGS s3_max_single_read_retries=30 FORMAT CSV"
    result = run_query(instance, get_query)
    assert result.splitlines() == ["500001,500000,0"]


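# Reading the mock's slow_send_test.csv, which is served slowly (see s3_mocks/unstable_server.py).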
def test_storage_s3_get_slow(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64"
    get_query = f"SELECT count(), sum(column3), sum(column4) FROM s3('http://resolver:8081/{started_cluster.minio_bucket}/slow_send_test.csv', 'CSV', '{table_format}') FORMAT CSV"
    result = run_query(instance, get_query)
    assert result.splitlines() == ["500001,500000,0"]


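# Write uncompressed CSV through the S3 engine; verify the object content and the
# system.blob_storage_log entries for the insert.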
def test_storage_s3_put_uncompressed(started_cluster):
    id = uuid.uuid4()
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    filename = f"{id}/test_put_uncompressed.bin"
    name = "test_put_uncompressed"
    data = [
        "'Gloria Thompson',99",
        "'Matthew Tang',98",
        "'Patsy Anderson',23",
        "'Nancy Badillo',93",
        "'Roy Hunt',5",
        "'Adam Kirk',51",
        "'Joshua Douds',28",
        "'Jolene Ryan',0",
        "'Roxanne Padilla',50",
        "'Howard Roberts',41",
        "'Ricardo Broughton',13",
        "'Roland Speer',83",
        "'Cathy Cohan',58",
        "'Kathie Dawson',100",
        "'Gregg Mcquistion',11",
    ]
    run_query(
        instance,
        "CREATE TABLE {} (name String, id UInt32) ENGINE = S3('http://{}:{}/{}/{}', 'CSV')".format(
            name, started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename
        ),
    )

    insert_query_id = uuid.uuid4().hex
    data_sep = "),("
    run_query(
        instance,
        "INSERT INTO {} VALUES ({})".format(name, data_sep.join(data)),
        query_id=insert_query_id,
    )

    assert run_query(instance, "SELECT sum(id) FROM {}".format(name)).splitlines() == [
        "753"
    ]

    uncompressed_content = get_s3_file_content(started_cluster, bucket, filename)
    assert sum([int(i.split(",")[1]) for i in uncompressed_content.splitlines()]) == 753

    instance.query("SYSTEM FLUSH LOGS")
    blob_storage_log = instance.query(f"SELECT * FROM system.blob_storage_log")

    result = instance.query(
        f"""SELECT
            countIf(event_type == 'Upload'),
            countIf(remote_path == '{filename}'),
            countIf(bucket == '{bucket}'),
            count()
        FROM system.blob_storage_log WHERE query_id = '{insert_query_id}'"""
    )
    r = result.strip().split("\t")
    assert int(r[0]) >= 1, blob_storage_log
    assert all(col == r[0] for col in r), blob_storage_log

    run_query(instance, f"DROP TABLE {name}")


@pytest.mark.parametrize(
    "extension,method",
    [pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz")],
)
def test_storage_s3_put_gzip(started_cluster, extension, method):
    id = uuid.uuid4()
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    filename = f"{id}/test_put_gzip.{extension}"
    name = f"test_put_gzip_{extension}"
    data = [
        "'Joseph Tomlinson',5",
        "'Earnest Essary',44",
        "'Matha Pannell',24",
        "'Michael Shavers',46",
        "'Elias Groce',38",
        "'Pamela Bramlet',50",
        "'Lewis Harrell',49",
        "'Tamara Fyall',58",
        "'George Dixon',38",
        "'Alice Walls',49",
        "'Paula Mais',24",
        "'Myrtle Pelt',93",
        "'Sylvia Naffziger',18",
        "'Amanda Cave',83",
        "'Yolanda Joseph',89",
    ]
    run_query(
        instance,
        f"""CREATE TABLE {name} (name String, id UInt32) ENGINE = S3(
                    'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{filename}',
                    'CSV',
                    '{method}')""",
    )

    run_query(instance, f"INSERT INTO {name} VALUES ({'),('.join(data)})")

    assert run_query(instance, f"SELECT sum(id) FROM {name}").splitlines() == ["708"]

    buf = io.BytesIO(
        get_s3_file_content(started_cluster, bucket, filename, decode=False)
    )
    f = gzip.GzipFile(fileobj=buf, mode="rb")
    uncompressed_content = f.read().decode()
    assert sum([int(i.split(",")[1]) for i in uncompressed_content.splitlines()]) == 708
    run_query(instance, f"DROP TABLE {name}")


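# TRUNCATE TABLE on an S3-backed table must remove the objects from the bucket.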
def test_truncate_table(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    name = "truncate"

    instance.query(
        "CREATE TABLE {} (id UInt32) ENGINE = S3('http://{}:{}/{}/{}', 'CSV')".format(
            name, started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, name
        )
    )
    instance.query("INSERT INTO {} SELECT number FROM numbers(10)".format(name))
    result = instance.query("SELECT * FROM {}".format(name))
    assert result == instance.query("SELECT number FROM numbers(10)")
    instance.query("TRUNCATE TABLE {}".format(name))

    minio = started_cluster.minio_client
    timeout = 30
    while timeout > 0:
        if (
            len(list(minio.list_objects(started_cluster.minio_bucket, "truncate/")))
            == 0
        ):
            break
        timeout -= 1
        time.sleep(1)
    assert len(list(minio.list_objects(started_cluster.minio_bucket, "truncate/"))) == 0
    # FIXME: there was a bug in test and it was never checked.
    # Currently read from truncated table fails with
    # DB::Exception: Failed to get object info: No response body..
    # HTTP response code: 404: while reading truncate: While executing S3Source
    # assert instance.query("SELECT * FROM {}".format(name)) == ""
    instance.query(f"DROP TABLE {name} SYNC")
    assert (
        instance.query(f"SELECT count() FROM system.tables where name='{name}'")
        == "0\n"
    )


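# Named collections (predefined connection configuration) require the NAMED COLLECTION
# grant; also checks the error for a non-existent collection.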
def test_predefined_connection_configuration(started_cluster):
    id = uuid.uuid4()
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances[
        "dummy_without_named_collections"
    ]  # type: ClickHouseInstance
    name = "test_table"

    instance.query("CREATE USER user")
    instance.query("GRANT CREATE ON *.* TO user")
    instance.query("GRANT SOURCES ON *.* TO user")
    instance.query("GRANT SELECT ON *.* TO user")

    instance.query(f"drop table if exists {name}", user="user")
    error = instance.query_and_get_error(
        f"CREATE TABLE {name} (id UInt32) ENGINE = S3(s3_conf1, format='CSV')",
        user="user",
    )
    assert (
        "To execute this query, it's necessary to have the grant NAMED COLLECTION ON s3_conf1"
        in error
    )

    instance.query("GRANT NAMED COLLECTION ON s3_conf1 TO user", user="admin")
    instance.query(
        f"CREATE TABLE {name} (id UInt32) ENGINE = S3(s3_conf1, format='CSV')",
        user="user",
    )

    instance.query(
        f"INSERT INTO {name} SELECT number FROM numbers(10) SETTINGS s3_truncate_on_insert=1"
    )
    result = instance.query(f"SELECT * FROM {name}")
    assert result == instance.query("SELECT number FROM numbers(10)")

    result = instance.query(
        "SELECT * FROM s3(s3_conf1, format='CSV', structure='id UInt32')", user="user"
    )
    assert result == instance.query("SELECT number FROM numbers(10)")

    error = instance.query_and_get_error("SELECT * FROM s3(no_collection)", user="user")
    assert (
        "To execute this query, it's necessary to have the grant NAMED COLLECTION ON no_collection"
        in error
    )
    instance2 = started_cluster.instances["dummy"]  # has named collection access
    error = instance2.query_and_get_error("SELECT * FROM s3(no_collection)")
    assert "There is no named collection `no_collection`" in error

    instance.query("DROP USER user")
    instance.query(f"DROP TABLE {name}")


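# While a slow select streams over url(), PartitionManager injects TCP resets and a
# short total outage; http retries should let the query finish with the right checksum.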
result = ""


def test_url_reconnect_in_the_middle(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]
    table_format = "id String, data String"
    filename = "test_url_reconnect_{}.tsv".format(random.randint(0, 1000))

    instance.query(
        f"""insert into table function
        s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'TSV', '{table_format}')
        select number, randomPrintableASCII(number % 1000) from numbers(1000000)"""
    )

    with PartitionManager() as pm:
        pm_rule_reject = {
            "probability": 0.02,
            "destination": instance.ip_address,
            "source_port": started_cluster.minio_port,
            "action": "REJECT --reject-with tcp-reset",
        }
        pm_rule_drop_all = {
            "destination": instance.ip_address,
            "source_port": started_cluster.minio_port,
            "action": "DROP",
        }
        pm._add_rule(pm_rule_reject)

        def select():
            global result
            result = instance.query(
                f"""select count(), sum(cityHash64(x)) from (select toUInt64(id) + sleep(0.1) as x from
                url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'TSV', '{table_format}')
                settings http_max_tries = 10, http_retry_max_backoff_ms = 2000, http_send_timeout=1, http_receive_timeout=1)"""
            )
            assert result == "1000000\t3914219105369203805\n"

        thread = threading.Thread(target=select)
        thread.start()
        time.sleep(4)
        pm._add_rule(pm_rule_drop_all)

        time.sleep(2)
        pm._delete_rule(pm_rule_drop_all)
        pm._delete_rule(pm_rule_reject)

        thread.join()

    assert result == "1000000\t3914219105369203805\n"


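# Large Parquet/ORC files must be readable with bounded memory; the amount actually
# read is checked via the ReadBufferFromS3Bytes profile event.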
# At the time of writing the actual read bytes are respectively 148 and 169, so -10% to not be flaky
@pytest.mark.parametrize(
    "format_name,expected_bytes_read", [("Parquet", 133), ("ORC", 150)]
)
def test_seekable_formats(started_cluster, format_name, expected_bytes_read):
    expected_lines = 1500000
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance

    table_function = f"s3(s3_{format_name.lower()}, structure='a Int32, b String', format='{format_name}')"
    exec_query_with_retry(
        instance,
        f"INSERT INTO TABLE FUNCTION {table_function} SELECT number, randomString(100) FROM numbers({expected_lines}) settings s3_truncate_on_insert=1",
        timeout=300,
    )

    result = instance.query(f"SELECT count() FROM {table_function}")
    assert int(result) == expected_lines

    result = instance.query(
        f"SELECT count() FROM {table_function} SETTINGS max_memory_usage='60M', max_download_threads=1"
    )
    assert int(result) == expected_lines

    instance.query(f"SELECT * FROM {table_function} FORMAT Null")

    instance.query("SYSTEM FLUSH LOGS")
    result = instance.query(
        f"SELECT formatReadableSize(ProfileEvents['ReadBufferFromS3Bytes']) FROM system.query_log WHERE startsWith(query, 'SELECT * FROM s3') AND memory_usage > 0 AND type='QueryFinish' ORDER BY event_time_microseconds DESC LIMIT 1"
    )
    result = result.strip()
    assert result.endswith("MiB")
    result = result[: result.index(".")]
    assert int(result) > expected_bytes_read


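# Same dataset read back through the url() table function under a memory limit.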
@pytest.mark.parametrize ( " format_name " , [ " Parquet " , " ORC " ] )
def test_seekable_formats_url ( started_cluster , format_name ) :
2021-11-20 12:01:45 +00:00
bucket = started_cluster . minio_bucket
2024-07-24 16:05:10 +00:00
expected_lines = 1500000
2022-07-04 11:52:53 +00:00
instance = started_cluster . instances [ " dummy " ] # type: ClickHouseInstance
2021-11-20 12:01:45 +00:00
2024-07-24 16:05:10 +00:00
format_name_lower = format_name . lower ( )
2024-07-24 15:49:17 +00:00
table_function = f " s3(s3_ { format_name_lower } , structure= ' a Int32, b String ' , format= ' { format_name } ' ) "
2022-07-04 11:52:53 +00:00
exec_query_with_retry (
instance ,
2024-07-24 15:49:17 +00:00
f " INSERT INTO TABLE FUNCTION { table_function } SELECT number, randomString(100) FROM numbers( { expected_lines } ) settings s3_truncate_on_insert=1 " ,
timeout = 300 ,
2022-01-18 19:26:13 +00:00
)
2021-11-20 12:01:45 +00:00
result = instance . query ( f " SELECT count() FROM { table_function } " )
2024-07-24 15:49:17 +00:00
assert int ( result ) == expected_lines
2021-11-20 12:01:45 +00:00
2024-07-24 15:49:17 +00:00
url_function = f " url( ' http:// { started_cluster . minio_host } : { started_cluster . minio_port } / { bucket } /test_ { format_name_lower } ' , ' { format_name } ' , ' a Int32, b String ' ) "
2021-11-20 12:01:45 +00:00
result = instance . query (
2024-07-24 15:49:17 +00:00
f " SELECT count() FROM { url_function } SETTINGS max_memory_usage= ' 60M ' "
2021-11-20 12:01:45 +00:00
)
2024-07-24 15:49:17 +00:00
assert int ( result ) == expected_lines
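
# Reading a zero-byte S3 object should succeed and return zero rows.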
def test_empty_file(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    name = "empty"
    url = f"http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{name}"

    minio = started_cluster.minio_client
    minio.put_object(bucket, name, io.BytesIO(b""), 0)

    table_function = f"s3('{url}', 'CSV', 'id Int32')"
    result = instance.query(f"SELECT count() FROM {table_function}")
    assert int(result) == 0
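
# INSERT into a path containing a glob is invalid and must be rejected.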
def test_insert_with_path_with_globs(started_cluster):
    instance = started_cluster.instances["dummy"]

    table_function_3 = f"s3('http://minio1:9001/root/test_parquet*', 'minio', 'minio123', 'Parquet', 'a Int32, b String')"
    instance.query_and_get_error(
        f"insert into table function {table_function_3} SELECT number, randomString(100) FROM numbers(500)"
    )
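
# DESCRIBE on s3()/url() table functions and on S3/URL table engines should infer
# 'a Int32, b String' from a Native file without an explicit structure argument.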
def test_s3_schema_inference(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(
        f"insert into table function s3(s3_native, structure='a Int32, b String', format='Native') select number, randomString(100) from numbers(5000000) SETTINGS s3_truncate_on_insert=1"
    )
    result = instance.query(f"desc s3(s3_native, format='Native')")
    assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n"

    result = instance.query(f"select count(*) from s3(s3_native, format='Native')")
    assert int(result) == 5000000

    instance.query(
        f"create table schema_inference engine=S3(s3_native, format='Native')"
    )
    result = instance.query(f"desc schema_inference")
    assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n"

    result = instance.query(f"select count(*) from schema_inference")
    assert int(result) == 5000000

    table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_native', 'Native')"
    result = instance.query(f"desc {table_function}")
    assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n"

    result = instance.query(f"select count(*) from {table_function}")
    assert int(result) == 5000000

    instance.query(
        f"create table schema_inference_2 engine=URL('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_native', 'Native')"
    )
    result = instance.query(f"desc schema_inference_2")
    assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n"

    result = instance.query(f"select count(*) from schema_inference_2")
    assert int(result) == 5000000

    table_function = f"s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_native', 'Native')"
    result = instance.query(f"desc {table_function}")
    assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n"

    result = instance.query(f"select count(*) from {table_function}")
    assert int(result) == 5000000

    instance.query("drop table schema_inference")
    instance.query("drop table schema_inference_2")
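
# Overwriting an existing key requires s3_truncate_on_insert=1; a plain second
# insert into the same key must fail.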
def test_overwrite(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    table_function = f"s3(s3_parquet, structure='a Int32, b String', format='Parquet')"
    instance.query(f"create table test_overwrite as {table_function}")
    instance.query(f"truncate table test_overwrite")

    instance.query(
        f"insert into test_overwrite select number, randomString(100) from numbers(50) settings s3_truncate_on_insert=1"
    )
    instance.query_and_get_error(
        f"insert into test_overwrite select number, randomString(100) from numbers(100)"
    )
    instance.query(
        f"insert into test_overwrite select number, randomString(100) from numbers(200) settings s3_truncate_on_insert=1"
    )

    result = instance.query(f"select count() from test_overwrite")
    assert int(result) == 200

    instance.query(f"drop table test_overwrite")
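
# With s3_create_new_file_on_insert=1, repeated inserts write additional files
# instead of failing; checked both for a plain key and for the s3_parquet_gz
# named collection.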
def test_create_new_files_on_insert(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    table_function = f"s3(s3_parquet, structure='a Int32, b String', format='Parquet')"
    instance.query(f"create table test_multiple_inserts as {table_function}")
    instance.query(f"truncate table test_multiple_inserts")
    instance.query(
        f"insert into test_multiple_inserts select number, randomString(100) from numbers(10) settings s3_truncate_on_insert=1"
    )
    instance.query(
        f"insert into test_multiple_inserts select number, randomString(100) from numbers(20) settings s3_create_new_file_on_insert=1"
    )
    instance.query(
        f"insert into test_multiple_inserts select number, randomString(100) from numbers(30) settings s3_create_new_file_on_insert=1"
    )

    result = instance.query(f"select count() from test_multiple_inserts")
    assert int(result) == 60

    instance.query(f"drop table test_multiple_inserts")

    table_function = (
        f"s3(s3_parquet_gz, structure='a Int32, b String', format='Parquet')"
    )
    instance.query(f"create table test_multiple_inserts as {table_function}")
    instance.query(f"truncate table test_multiple_inserts")
    instance.query(
        f"insert into test_multiple_inserts select number, randomString(100) from numbers(10) settings s3_truncate_on_insert=1"
    )
    instance.query(
        f"insert into test_multiple_inserts select number, randomString(100) from numbers(20) settings s3_create_new_file_on_insert=1"
    )
    instance.query(
        f"insert into test_multiple_inserts select number, randomString(100) from numbers(30) settings s3_create_new_file_on_insert=1"
    )

    result = instance.query(f"select count() from test_multiple_inserts")
    assert int(result) == 60

    instance.query("drop table test_multiple_inserts")
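
# The .arrow and .parquet extensions let s3() and url() detect the format when
# it is not passed explicitly.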
def test_format_detection(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(f"create table arrow_table_s3 (x UInt64) engine=S3(s3_arrow)")
    instance.query(
        f"insert into arrow_table_s3 select 1 settings s3_truncate_on_insert=1"
    )
    result = instance.query(f"select * from s3(s3_arrow)")
    assert int(result) == 1

    result = instance.query(
        f"select * from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.arrow')"
    )
    assert int(result) == 1

    result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.arrow')"
    )
    assert int(result) == 1

    instance.query(f"create table parquet_table_s3 (x UInt64) engine=S3(s3_parquet2)")
    instance.query(
        f"insert into parquet_table_s3 select 1 settings s3_truncate_on_insert=1"
    )
    result = instance.query(f"select * from s3(s3_parquet2)")
    assert int(result) == 1

    result = instance.query(
        f"select * from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.parquet')"
    )
    assert int(result) == 1

    result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.parquet')"
    )
    assert int(result) == 1

    instance.query(f"drop table arrow_table_s3")
    instance.query(f"drop table parquet_table_s3")
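
# Schema inference over globs: inference should tolerate files that yield no type
# information on their own, and fail cleanly when no file (or a malformed file)
# provides a usable schema.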
def test_schema_inference_from_globs(started_cluster):
    id = uuid.uuid4()
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(
        f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test1.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL"
    )
    instance.query(
        f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test2.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select 0"
    )

    url_filename = "test{1,2}.jsoncompacteachrow"
    result = instance.query(
        f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings input_format_json_infer_incomplete_types_as_strings=0"
    )
    assert result.strip() == "c1\tNullable(Int64)"

    result = instance.query(
        f"select * from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings input_format_json_infer_incomplete_types_as_strings=0"
    )
    assert sorted(result.split()) == ["0", "\\N"]

    result = instance.query(
        f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test*.jsoncompacteachrow') settings input_format_json_infer_incomplete_types_as_strings=0"
    )
    assert result.strip() == "c1\tNullable(Int64)"

    result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test*.jsoncompacteachrow') settings input_format_json_infer_incomplete_types_as_strings=0"
    )
    assert sorted(result.split()) == ["0", "\\N"]

    instance.query(
        f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test3.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL"
    )

    url_filename = "test{1,3}.jsoncompacteachrow"
    result = instance.query_and_get_error(
        f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings schema_inference_use_cache_for_s3=0, input_format_json_infer_incomplete_types_as_strings=0"
    )
    assert "All attempts to extract table structure from files failed" in result

    result = instance.query_and_get_error(
        f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings schema_inference_use_cache_for_url=0, input_format_json_infer_incomplete_types_as_strings=0"
    )
    assert "All attempts to extract table structure from files failed" in result

    instance.query(
        f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test0.jsoncompacteachrow', 'TSV', 'x String') select '[123;]'"
    )
    result = instance.query_and_get_error(
        f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test*.jsoncompacteachrow') settings schema_inference_use_cache_for_s3=0, input_format_json_infer_incomplete_types_as_strings=0"
    )
    assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result

    url_filename = "test{0,1,2,3}.jsoncompacteachrow"
    result = instance.query_and_get_error(
        f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings schema_inference_use_cache_for_url=0, input_format_json_infer_incomplete_types_as_strings=0"
    )
    assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result
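
# Every supported s3() argument signature: (url), (url, format), (url, key, secret),
# (url, format, structure), (url, key, secret, format), and so on. Variants that
# pass a session token are expected to fail with S3_ERROR against MinIO.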
def test_signatures(started_cluster):
    session_token = "session token that will not be checked by MinIO"
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(f"create table test_signatures (x UInt64) engine=S3(s3_arrow)")
    instance.query(f"truncate table test_signatures")
    instance.query(f"insert into test_signatures select 1")

    result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.arrow')"
    )
    assert int(result) == 1

    result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.arrow', 'Arrow', 'x UInt64')"
    )
    assert int(result) == 1

    result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.arrow', 'minio', 'minio123')"
    )
    assert int(result) == 1

    error = instance.query_and_get_error(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.arrow', 'minio', 'minio123', '{session_token}')"
    )
    assert "S3_ERROR" in error

    result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.arrow', 'Arrow', 'x UInt64', 'auto')"
    )
    assert int(result) == 1

    result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.arrow', 'minio', 'minio123', 'Arrow')"
    )
    assert int(result) == 1

    error = instance.query_and_get_error(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.arrow', 'minio', 'minio123', '{session_token}', 'Arrow')"
    )
    assert "S3_ERROR" in error

    error = instance.query_and_get_error(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.arrow', 'minio', 'minio123', '{session_token}', 'Arrow', 'x UInt64')"
    )
    assert "S3_ERROR" in error

    error = instance.query_and_get_error(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.arrow', 'minio', 'minio123', '{session_token}', 'Arrow', 'x UInt64', 'auto')"
    )
    assert "S3_ERROR" in error

    instance.query(f"drop table test_signatures")
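
# Reading one of three columns should pull roughly a third of the bytes from S3
# compared to SELECT * (the ratio is checked via ReadBufferFromS3Bytes).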
def test_select_columns(started_cluster):
    bucket = started_cluster.minio_bucket
    id = uuid.uuid4()
    instance = started_cluster.instances["dummy"]
    name = "test_table2"
    structure = "id UInt32, value1 Int32, value2 Int32"

    instance.query(f"drop table if exists {name}")
    instance.query(
        f"CREATE TABLE {name} ({structure}) ENGINE = S3(s3_conf1, format='Parquet')"
    )

    limit = 10000000
    instance.query(
        f"INSERT INTO {name} SELECT * FROM generateRandom('{structure}') LIMIT {limit} SETTINGS s3_truncate_on_insert=1"
    )
    instance.query(f"SELECT value2, '{id}' FROM {name}")

    instance.query("SYSTEM FLUSH LOGS")
    result1 = instance.query(
        f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE type='QueryFinish' and query LIKE 'SELECT value2, ''{id}'' FROM {name}'"
    )

    instance.query(f"SELECT *, '{id}' FROM {name}")

    instance.query("SYSTEM FLUSH LOGS")
    result2 = instance.query(
        f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE type='QueryFinish' and query LIKE 'SELECT *, ''{id}'' FROM {name}'"
    )

    assert round(int(result2) / int(result1)) == 3

def test_insert_select_schema_inference(started_cluster):
    id = uuid.uuid4()
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test_insert_select.native') select toUInt64(1) as x"
    )
    result = instance.query(
        f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test_insert_select.native')"
    )
    assert result.strip() == "x\tUInt64"

    result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test_insert_select.native')"
    )
    assert int(result) == 1

def test_parallel_reading_with_memory_limit(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_memory_limit.native') select * from numbers(1000000) SETTINGS s3_truncate_on_insert=1"
    )
    result = instance.query_and_get_error(
        f"select * from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_memory_limit.native') settings max_memory_usage=1000"
    )
    assert "Memory limit (for query) exceeded" in result

    time.sleep(5)

    # Check that the server didn't crash.
    result = instance.query("select 1")
    assert int(result) == 1

def test_wrong_format_usage(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_wrong_format.native') select * from numbers(10e6) SETTINGS s3_truncate_on_insert=1"
    )
    # size(test_wrong_format.native) = 10e6*8 + 16 (header) ~= 76MiB

    # Ensure that the whole file does not have to be loaded into memory.
    result = instance.query_and_get_error(
        f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_wrong_format.native', 'Parquet') settings input_format_allow_seeks=0, max_memory_usage='10Mi'"
    )
    assert "Not a Parquet file" in result
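
# Helpers for the schema-cache tests below: flush the query log and read the
# given ProfileEvents counter from the most recent matching query.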
def check_profile_event_for_query(
    instance, file, storage_name, started_cluster, bucket, profile_event, amount
):
    instance.query("system flush logs")
    query_pattern = f"{storage_name}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file}'".replace(
        "'", "\\'"
    )
    res = int(
        instance.query(
            f"select ProfileEvents['{profile_event}'] from system.query_log where query like '%{query_pattern}%' and query not like '%ProfileEvents%' and type = 'QueryFinish' order by query_start_time_microseconds desc limit 1"
        )
    )
    assert res == amount

def check_cache_misses(instance, file, storage_name, started_cluster, bucket, amount=1):
    check_profile_event_for_query(
        instance,
        file,
        storage_name,
        started_cluster,
        bucket,
        "SchemaInferenceCacheMisses",
        amount,
    )


def check_cache_hits(instance, file, storage_name, started_cluster, bucket, amount=1):
    check_profile_event_for_query(
        instance,
        file,
        storage_name,
        started_cluster,
        bucket,
        "SchemaInferenceCacheHits",
        amount,
    )


def check_cache_invalidations(
    instance, file, storage_name, started_cluster, bucket, amount=1
):
    check_profile_event_for_query(
        instance,
        file,
        storage_name,
        started_cluster,
        bucket,
        "SchemaInferenceCacheInvalidations",
        amount,
    )


def check_cache_evictions(
    instance, file, storage_name, started_cluster, bucket, amount=1
):
    check_profile_event_for_query(
        instance,
        file,
        storage_name,
        started_cluster,
        bucket,
        "SchemaInferenceCacheEvictions",
        amount,
    )


def check_cache_num_rows_hits(
    instance, file, storage_name, started_cluster, bucket, amount=1
):
    check_profile_event_for_query(
        instance,
        file,
        storage_name,
        started_cluster,
        bucket,
        "SchemaInferenceCacheNumRowsHits",
        amount,
    )

def run_describe_query(instance, file, storage_name, started_cluster, bucket):
    query = f"desc {storage_name}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file}')"
    instance.query(query)


def run_count_query(instance, file, storage_name, started_cluster, bucket):
    query = f"select count() from {storage_name}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file}', auto, 'x UInt64')"
    return instance.query(query)


def check_cache(instance, expected_files):
    sources = instance.query("select source from system.schema_inference_cache")
    assert sorted(map(lambda x: x.strip().split("/")[-1], sources.split())) == sorted(
        expected_files
    )
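
# End-to-end schema-cache behaviour: misses, hits, invalidation on overwrite, and
# LRU eviction (the cache on this instance is small enough that describing a
# third file evicts the least recently used entry).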
def test_schema_inference_cache(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    def test(storage_name):
        instance.query("system drop schema cache")
        instance.query(
            f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache0.jsonl') select * from numbers(100) settings s3_truncate_on_insert=1"
        )
        time.sleep(1)

        run_describe_query(
            instance, "test_cache0.jsonl", storage_name, started_cluster, bucket
        )
        check_cache(instance, ["test_cache0.jsonl"])
        check_cache_misses(
            instance, "test_cache0.jsonl", storage_name, started_cluster, bucket
        )

        run_describe_query(
            instance, "test_cache0.jsonl", storage_name, started_cluster, bucket
        )
        check_cache_hits(
            instance, "test_cache0.jsonl", storage_name, started_cluster, bucket
        )

        instance.query(
            f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache0.jsonl') select * from numbers(100) settings s3_truncate_on_insert=1"
        )
        time.sleep(1)

        run_describe_query(
            instance, "test_cache0.jsonl", storage_name, started_cluster, bucket
        )
        check_cache_invalidations(
            instance, "test_cache0.jsonl", storage_name, started_cluster, bucket
        )

        instance.query(
            f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache1.jsonl') select * from numbers(100) settings s3_truncate_on_insert=1"
        )
        time.sleep(1)

        run_describe_query(
            instance, "test_cache1.jsonl", storage_name, started_cluster, bucket
        )
        check_cache(instance, ["test_cache0.jsonl", "test_cache1.jsonl"])
        check_cache_misses(
            instance, "test_cache1.jsonl", storage_name, started_cluster, bucket
        )

        run_describe_query(
            instance, "test_cache1.jsonl", storage_name, started_cluster, bucket
        )
        check_cache_hits(
            instance, "test_cache1.jsonl", storage_name, started_cluster, bucket
        )

        instance.query(
            f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache2.jsonl') select * from numbers(100) settings s3_truncate_on_insert=1"
        )
        time.sleep(1)

        run_describe_query(
            instance, "test_cache2.jsonl", storage_name, started_cluster, bucket
        )
        check_cache(instance, ["test_cache1.jsonl", "test_cache2.jsonl"])
        check_cache_misses(
            instance, "test_cache2.jsonl", storage_name, started_cluster, bucket
        )
        check_cache_evictions(
            instance, "test_cache2.jsonl", storage_name, started_cluster, bucket
        )

        run_describe_query(
            instance, "test_cache2.jsonl", storage_name, started_cluster, bucket
        )
        check_cache_hits(
            instance, "test_cache2.jsonl", storage_name, started_cluster, bucket
        )

        run_describe_query(
            instance, "test_cache1.jsonl", storage_name, started_cluster, bucket
        )
        check_cache_hits(
            instance, "test_cache1.jsonl", storage_name, started_cluster, bucket
        )

        run_describe_query(
            instance, "test_cache0.jsonl", storage_name, started_cluster, bucket
        )
        check_cache(instance, ["test_cache0.jsonl", "test_cache1.jsonl"])
        check_cache_misses(
            instance, "test_cache0.jsonl", storage_name, started_cluster, bucket
        )
        check_cache_evictions(
            instance, "test_cache0.jsonl", storage_name, started_cluster, bucket
        )

        run_describe_query(
            instance, "test_cache2.jsonl", storage_name, started_cluster, bucket
        )
        check_cache(instance, ["test_cache0.jsonl", "test_cache2.jsonl"])
        check_cache_misses(
            instance, "test_cache2.jsonl", storage_name, started_cluster, bucket
        )
        check_cache_evictions(
            instance, "test_cache2.jsonl", storage_name, started_cluster, bucket
        )

        run_describe_query(
            instance, "test_cache2.jsonl", storage_name, started_cluster, bucket
        )
        check_cache_hits(
            instance, "test_cache2.jsonl", storage_name, started_cluster, bucket
        )

        run_describe_query(
            instance, "test_cache0.jsonl", storage_name, started_cluster, bucket
        )
        check_cache_hits(
            instance, "test_cache0.jsonl", storage_name, started_cluster, bucket
        )

        instance.query(
            f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache3.jsonl') select * from numbers(100) settings s3_truncate_on_insert=1"
        )
        time.sleep(1)

        files = "test_cache{0,1,2,3}.jsonl"
        run_describe_query(instance, files, storage_name, started_cluster, bucket)
        check_cache_hits(instance, files, storage_name, started_cluster, bucket)

        instance.query(f"system drop schema cache for {storage_name}")
        check_cache(instance, [])

        run_describe_query(instance, files, storage_name, started_cluster, bucket)
        check_cache_misses(instance, files, storage_name, started_cluster, bucket, 4)

        instance.query("system drop schema cache")
        check_cache(instance, [])

        run_describe_query(instance, files, storage_name, started_cluster, bucket)
        check_cache_misses(instance, files, storage_name, started_cluster, bucket, 4)

        instance.query("system drop schema cache")
        instance.query(
            f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache0.csv') select * from numbers(100) settings s3_truncate_on_insert=1"
        )
        time.sleep(1)

        res = run_count_query(
            instance, "test_cache0.csv", storage_name, started_cluster, bucket
        )
        assert int(res) == 100
        check_cache(instance, ["test_cache0.csv"])
        check_cache_misses(
            instance, "test_cache0.csv", storage_name, started_cluster, bucket
        )

        res = run_count_query(
            instance, "test_cache0.csv", storage_name, started_cluster, bucket
        )
        assert int(res) == 100
        check_cache_hits(
            instance, "test_cache0.csv", storage_name, started_cluster, bucket
        )

        instance.query(
            f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache0.csv') select * from numbers(200) settings s3_truncate_on_insert=1"
        )
        time.sleep(1)

        res = run_count_query(
            instance, "test_cache0.csv", storage_name, started_cluster, bucket
        )
        assert int(res) == 200
        check_cache_invalidations(
            instance, "test_cache0.csv", storage_name, started_cluster, bucket
        )

        instance.query(
            f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache1.csv') select * from numbers(100) settings s3_truncate_on_insert=1"
        )
        time.sleep(1)

        res = run_count_query(
            instance, "test_cache1.csv", storage_name, started_cluster, bucket
        )
        assert int(res) == 100
        check_cache(instance, ["test_cache0.csv", "test_cache1.csv"])
        check_cache_misses(
            instance, "test_cache1.csv", storage_name, started_cluster, bucket
        )

        res = run_count_query(
            instance, "test_cache1.csv", storage_name, started_cluster, bucket
        )
        assert int(res) == 100
        check_cache_hits(
            instance, "test_cache1.csv", storage_name, started_cluster, bucket
        )

        res = run_count_query(
            instance, "test_cache{0,1}.csv", storage_name, started_cluster, bucket
        )
        assert int(res) == 300
        check_cache_hits(
            instance, "test_cache{0,1}.csv", storage_name, started_cluster, bucket, 2
        )

        instance.query(f"system drop schema cache for {storage_name}")
        check_cache(instance, [])

        res = run_count_query(
            instance, "test_cache{0,1}.csv", storage_name, started_cluster, bucket
        )
        assert int(res) == 300
        check_cache_misses(
            instance, "test_cache{0,1}.csv", storage_name, started_cluster, bucket, 2
        )

        instance.query(f"system drop schema cache for {storage_name}")
        check_cache(instance, [])

        instance.query(
            f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache.parquet') select * from numbers(100) settings s3_truncate_on_insert=1"
        )
        time.sleep(1)

        res = instance.query(
            f"select count() from {storage_name}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache.parquet')"
        )
        assert int(res) == 100
        check_cache_misses(
            instance, "test_cache.parquet", storage_name, started_cluster, bucket
        )
        check_cache_hits(
            instance, "test_cache.parquet", storage_name, started_cluster, bucket
        )
        check_cache_num_rows_hits(
            instance, "test_cache.parquet", storage_name, started_cluster, bucket
        )

    test("s3")
    test("url")
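
# Credentials against a restricted bucket: anonymous access gets a 403, while an
# explicit Authorization header passed via headers(...) in the query succeeds.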
def test_ast_auth_headers(started_cluster):
    bucket = started_cluster.minio_restricted_bucket
    instance = started_cluster.instances["s3_non_default"]  # type: ClickHouseInstance
    filename = "test.csv"

    result = instance.query_and_get_error(
        f"select count() from s3('http://resolver:8080/{bucket}/{filename}', 'CSV', 'dummy String')"
    )
    assert "HTTP response code: 403" in result
    assert "S3_ERROR" in result

    result = instance.query(
        f"select * from s3('http://resolver:8080/{bucket}/{filename}', 'CSV', headers(Authorization=`Bearer TOKEN`))"
    )
    assert result.strip() == "1\t2\t3"

def test_environment_credentials(started_cluster):
    bucket = started_cluster.minio_restricted_bucket
    instance = started_cluster.instances["s3_with_environment_credentials"]

    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache3.jsonl') select * from numbers(100) settings s3_truncate_on_insert=1"
    )
    assert (
        "100"
        == instance.query(
            f"select count() from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache3.jsonl')"
        ).strip()
    )

    # A manually defined access key should override the one from the environment.
    with pytest.raises(helpers.client.QueryRuntimeException) as ei:
        instance.query(
            f"select count() from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_cache4.jsonl', 'aws', 'aws123')"
        )

    assert ei.value.returncode == 243
    assert "HTTP response code: 403" in ei.value.stderr
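
# 'resolver' is a mock S3 endpoint on port 8083 whose reset_counters handler
# makes it start failing requests after a configurable number of successful
# ones, so listing objects fails partway through the query.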
def test_s3_list_objects_failure(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    filename = "test_no_list_{_partition_id}.csv"

    put_query = f"""
        INSERT INTO TABLE FUNCTION
        s3('http://resolver:8083/{bucket}/{filename}', 'CSV', 'c1 UInt32')
        PARTITION BY c1 % 20
        SELECT number FROM numbers(100)
        SETTINGS s3_truncate_on_insert=1
        """

    run_query(instance, put_query)

    T = 10
    for _ in range(0, T):
        started_cluster.exec_in_container(
            started_cluster.get_container_id("resolver"),
            [
                "curl",
                "-X",
                "POST",
                f"http://localhost:8083/reset_counters?max={random.randint(1, 15)}",
            ],
        )

        get_query = """
            SELECT sleep({seconds}) FROM s3('http://resolver:8083/{bucket}/test_no_list_*', 'CSV', 'c1 UInt32')
            SETTINGS s3_list_object_keys_size = 1, max_threads = {max_threads}, enable_s3_requests_logging = 1
            """.format(
            bucket=bucket, seconds=random.random(), max_threads=random.randint(2, 20)
        )

        with pytest.raises(helpers.client.QueryRuntimeException) as ei:
            result = run_query(instance, get_query)
            print(result)

        assert ei.value.returncode == 243
        assert "Could not list objects" in ei.value.stderr
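
# Empty files are not valid Parquet; s3_skip_empty_files and
# engine_url_skip_empty_files control whether they are skipped or raise an error.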
def test_skip_empty_files(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet', TSVRaw) select * from numbers(0) settings s3_truncate_on_insert=1"
    )
    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files2.parquet') select * from numbers(1) settings s3_truncate_on_insert=1"
    )

    def test(engine, setting):
        instance.query_and_get_error(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet') settings {setting}=0"
        )

        instance.query_and_get_error(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet', auto, 'number UInt64') settings {setting}=0"
        )

        instance.query_and_get_error(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet') settings {setting}=1"
        )

        res = instance.query(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet', auto, 'number UInt64') settings {setting}=1"
        )
        assert len(res) == 0

        instance.query_and_get_error(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1,2}}.parquet') settings {setting}=0"
        )

        instance.query_and_get_error(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1,2}}.parquet', auto, 'number UInt64') settings {setting}=0"
        )

        res = instance.query(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1,2}}.parquet') settings {setting}=1"
        )
        assert int(res) == 0

        res = instance.query(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1,2}}.parquet', auto, 'number UInt64') settings {setting}=1"
        )
        assert int(res) == 0

    test("s3", "s3_skip_empty_files")
    test("url", "engine_url_skip_empty_files")

    res = instance.query(
        f"select * from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1|2}}.parquet') settings engine_url_skip_empty_files=1"
    )
    assert int(res) == 0

    res = instance.query(
        f"select * from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{11|1|22}}.parquet', auto, 'number UInt64') settings engine_url_skip_empty_files=1"
    )
    assert len(res.strip()) == 0
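
# Tuple subcolumn reads plus the _path/_file virtual columns, including a
# mismatched column name falling back to zero defaults and to an explicit
# DEFAULT clause.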
def test_read_subcolumns(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) SETTINGS s3_truncate_on_insert=1"
    )
    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) SETTINGS s3_truncate_on_insert=1"
    )

    res = instance.query(
        f"select a.b.d, _path, a.b, _file, a.e from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')"
    )
    assert res == "2\troot/test_subcolumns.tsv\t(1,2)\ttest_subcolumns.tsv\t3\n"

    res = instance.query(
        f"select a.b.d, _path, a.b, _file, a.e from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')"
    )
    assert res == "2\troot/test_subcolumns.jsonl\t(1,2)\ttest_subcolumns.jsonl\t3\n"

    res = instance.query(
        f"select x.b.d, _path, x.b, _file, x.e from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')"
    )
    assert res == "0\troot/test_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\n"

    res = instance.query(
        f"select x.b.d, _path, x.b, _file, x.e from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32) default ((42, 42), 42)')"
    )
    assert res == "42\troot/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n"

    res = instance.query(
        f"select a.b.d, _path, a.b, _file, a.e from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')"
    )
    assert res == "2\t/root/test_subcolumns.tsv\t(1,2)\ttest_subcolumns.tsv\t3\n"

    res = instance.query(
        f"select a.b.d, _path, a.b, _file, a.e from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')"
    )
    assert res == "2\t/root/test_subcolumns.jsonl\t(1,2)\ttest_subcolumns.jsonl\t3\n"

    res = instance.query(
        f"select x.b.d, _path, x.b, _file, x.e from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')"
    )
    assert res == "0\t/root/test_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\n"

    res = instance.query(
        f"select x.b.d, _path, x.b, _file, x.e from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32) default ((42, 42), 42)')"
    )
    assert (
        res == "42\t/root/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n"
    )

def test_read_subcolumn_time(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a UInt32') select (42) SETTINGS s3_truncate_on_insert=1"
    )

    res = instance.query(
        f"select a, dateDiff('minute', _time, now()) < 59 from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a UInt32')"
    )
    assert res == "42\t1\n"
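
# A predicate on the _file virtual column must prune the file list: only one of
# the three objects is opened (EngineFileLikeReadFiles == 1).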
def test_filtering_by_file_or_path(started_cluster):
    id = uuid.uuid4()
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter1.tsv', auto, 'x UInt64') select 1 SETTINGS s3_truncate_on_insert=1"
    )
    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter2.tsv', auto, 'x UInt64') select 2 SETTINGS s3_truncate_on_insert=1"
    )
    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter3.tsv', auto, 'x UInt64') select 3 SETTINGS s3_truncate_on_insert=1"
    )

    instance.query(
        f"select count(), '{id}' from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter*.tsv') where _file = 'test_filter1.tsv'"
    )

    instance.query("SYSTEM FLUSH LOGS")
    result = instance.query(
        f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query like '%{id}%' AND type='QueryFinish'"
    )
    assert int(result) == 1

    assert 0 == int(
        instance.query(
            f"select count() from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter*.tsv') where _file = 'kek'"
        )
    )
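
# schema_inference_mode='union' merges per-file schemas instead of requiring
# them to be identical; a file whose schema cannot be inferred fails the union.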
def test_union_schema_inference_mode(started_cluster):
    id = uuid.uuid4()
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["s3_non_default"]
    file_name_prefix = f"test_union_schema_inference_{id}_"

    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}1.jsonl') select 1 as a SETTINGS s3_truncate_on_insert=1"
    )
    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}2.jsonl') select 2 as b SETTINGS s3_truncate_on_insert=1"
    )
    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}3.jsonl') select 2 as c SETTINGS s3_truncate_on_insert=1"
    )
    instance.query(
        f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}4.jsonl', TSV) select 'Error' SETTINGS s3_truncate_on_insert=1"
    )

    for engine in ["s3", "url"]:
        instance.query("system drop schema cache for s3")

        result = instance.query(
            f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}{{1,2,3}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
        )
        assert result == "a\tNullable(Int64)\nb\tNullable(Int64)\nc\tNullable(Int64)\n"

        result = instance.query(
            f"select schema_inference_mode, splitByChar('/', source)[-1] as file, schema from system.schema_inference_cache where source like '%{file_name_prefix}%' order by file format TSV"
        )
        assert (
            result == f"UNION\t{file_name_prefix}1.jsonl\ta Nullable(Int64)\n"
            f"UNION\t{file_name_prefix}2.jsonl\tb Nullable(Int64)\n"
            f"UNION\t{file_name_prefix}3.jsonl\tc Nullable(Int64)\n"
        )

        result = instance.query(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}{{1,2,3}}.jsonl') order by tuple(*) settings schema_inference_mode='union', describe_compact_output=1 format TSV"
        )
        assert result == "1\t\\N\t\\N\n" "\\N\t2\t\\N\n" "\\N\t\\N\t2\n"

        instance.query(f"system drop schema cache for {engine}")

        result = instance.query(
            f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}2.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
        )
        assert result == "b\tNullable(Int64)\n"

        result = instance.query(
            f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}{{1,2,3}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
        )
        assert (
            result == "a\tNullable(Int64)\n"
            "b\tNullable(Int64)\n"
            "c\tNullable(Int64)\n"
        )

        error = instance.query_and_get_error(
            f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}{{1,2,3,4}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
        )
        assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error
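
# Content-based format detection: the objects are written without an extension,
# so both the format and the schema must be inferred from the data itself.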
def test_s3_format_detection(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(
        f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection0', 'JSONEachRow', 'x UInt64, y String') select number, 'str_' || toString(number) from numbers(0) settings s3_truncate_on_insert=1"
    )
    instance.query(
        f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1', 'JSONEachRow', 'x UInt64, y String') select number, 'str_' || toString(number) from numbers(5) settings s3_truncate_on_insert=1"
    )

    expected_result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1', 'JSONEachRow', 'x UInt64, y String')"
    )
    expected_desc_result = instance.query(
        f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1', 'JSONEachRow')"
    )

    for engine in ["s3", "url"]:
        desc_result = instance.query(
            f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1')"
        )
        assert desc_result == expected_desc_result

        result = instance.query(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1')"
        )
        assert result == expected_result

        result = instance.query(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1', auto, 'x UInt64, y String')"
        )
        assert result == expected_result

        result = instance.query(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection{{0,1}}', auto, 'x UInt64, y String')"
        )
        assert result == expected_result

        instance.query(f"system drop schema cache for {engine}")

        result = instance.query(
            f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection{{0,1}}', auto, 'x UInt64, y String')"
        )
        assert result == expected_result
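
# Partitioned writes must respect object existence the same way plain writes do:
# fail without s3_truncate_on_insert, overwrite with it, and write a new
# key.1.csv when s3_create_new_file_on_insert=1 is set.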
def test_respect_object_existence_on_partitioned_write(started_cluster):
    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]

    instance.query(
        f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_partitioned_write42.csv', 'CSV', 'x UInt64') select 42 settings s3_truncate_on_insert=1"
    )
    result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_partitioned_write42.csv')"
    )
    assert int(result) == 42

    error = instance.query_and_get_error(
        f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_partitioned_write{{_partition_id}}.csv', 'CSV', 'x UInt64') partition by 42 select 42 settings s3_truncate_on_insert=0"
    )
    assert "BAD_ARGUMENTS" in error

    instance.query(
        f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_partitioned_write{{_partition_id}}.csv', 'CSV', 'x UInt64') partition by 42 select 43 settings s3_truncate_on_insert=1"
    )
    result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_partitioned_write42.csv')"
    )
    assert int(result) == 43

    instance.query(
        f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_partitioned_write{{_partition_id}}.csv', 'CSV', 'x UInt64') partition by 42 select 44 settings s3_truncate_on_insert=0, s3_create_new_file_on_insert=1"
    )
    result = instance.query(
        f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_partitioned_write42.1.csv')"
    )
    assert int(result) == 44