ClickHouse/tests/integration/test_file_cluster/test.py

import logging
import csv
import time

import pytest
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV


logging.getLogger().setLevel(logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())


@pytest.fixture(scope="module")
def started_cluster():
    try:
        cluster = ClickHouseCluster(__file__)
        cluster.add_instance(
            "s0_0_0",
            main_configs=["configs/cluster.xml"],
            user_configs=["configs/users.xml"],
            macros={"replica": "node1", "shard": "shard1"},
            with_zookeeper=True,
        )
        cluster.add_instance(
            "s0_0_1",
            main_configs=["configs/cluster.xml"],
            user_configs=["configs/users.xml"],
            macros={"replica": "replica2", "shard": "shard1"},
            with_zookeeper=True,
        )
        cluster.add_instance(
            "s0_1_0",
            main_configs=["configs/cluster.xml"],
            user_configs=["configs/users.xml"],
            macros={"replica": "replica1", "shard": "shard2"},
            with_zookeeper=True,
        )

        logging.info("Starting cluster...")
        cluster.start()
        logging.info("Cluster started")

        for node_name in ("s0_0_0", "s0_0_1", "s0_1_0"):
            for i in range(1, 3):
                cluster.instances[node_name].query(
                    f"""
                INSERT INTO TABLE FUNCTION file(
                    'file{i}.csv', 'CSV', 's String, i UInt32') VALUES ('file{i}',{i})
                    """
                )

        yield cluster
    finally:
        cluster.shutdown()


def get_query(select: str, cluster: bool, files_nums: str, order_by="ORDER BY (i, s)"):
    if cluster:
        return f"SELECT {select} from fileCluster('my_cluster', 'file{{{files_nums}}}.csv', 'CSV', 's String, i UInt32') {order_by}"
    else:
        return f"SELECT {select} from file('file{{{files_nums}}}.csv', 'CSV', 's String, i UInt32') {order_by}"


def test_select_all(started_cluster):
    node = started_cluster.instances["s0_0_0"]

    local = node.query(get_query("*", False, "1,2"))
    distributed = node.query(get_query("*", True, "1,2"))

    assert TSV(local) == TSV(distributed)


def test_count(started_cluster):
    node = started_cluster.instances["s0_0_0"]

    local = node.query(get_query("count(*)", False, "1,2", ""))
    distributed = node.query(get_query("count(*)", True, "1,2", ""))

    assert TSV(local) == TSV(distributed)


def test_non_existent_cluster(started_cluster):
    node = started_cluster.instances["s0_0_0"]
    error = node.query_and_get_error(
        """
    SELECT count(*) from fileCluster(
        'non_existent_cluster', 'file{1,2}.csv', 'CSV', 's String, i UInt32')
    UNION ALL
    SELECT count(*) from fileCluster(
        'non_existent_cluster', 'file{1,2}.csv', 'CSV', 's String, i UInt32')
    """
    )

    assert "not found" in error


def test_missing_file(started_cluster):
    """
    Select from a list of files, _some_ of them don't exist
    """
    node = started_cluster.instances["s0_0_0"]

    local_with_missing_file = node.query(get_query("*", False, "1,2,3"))
    local_wo_missing_file = node.query(get_query("*", False, "1,2"))

    distributed_with_missing_file = node.query(get_query("*", True, "1,2,3"))
    distributed_wo_missing_file = node.query(get_query("*", True, "1,2"))

    assert TSV(local_with_missing_file) == TSV(distributed_with_missing_file)
    assert TSV(local_wo_missing_file) == TSV(distributed_wo_missing_file)
    assert TSV(local_with_missing_file) == TSV(distributed_wo_missing_file)
    assert TSV(local_wo_missing_file) == TSV(distributed_with_missing_file)


def test_no_such_files(started_cluster):
    """
    Select from a list of files, _none_ of them don't exist
    """
    node = started_cluster.instances["s0_0_0"]

    local = node.query(get_query("*", False, "3,4"))
    distributed = node.query(get_query("*", True, "3,4"))

    assert TSV(local) == TSV(distributed)


def test_schema_inference(started_cluster):
    node = started_cluster.instances["s0_0_0"]

    expected_result = node.query(
        "select * from file('file*.csv', 'CSV', 's String, i UInt32') ORDER BY (i, s)"
    )
    result = node.query(
        "select * from fileCluster('my_cluster', 'file*.csv') ORDER BY (c1, c2)"
    )
    assert result == expected_result

    result = node.query(
        "select * from fileCluster('my_cluster', 'file*.csv', auto) ORDER BY (c1, c2)"
    )
    assert result == expected_result

    result = node.query(
        "select * from fileCluster('my_cluster', 'file*.csv', CSV) ORDER BY (c1, c2)"
    )
    assert result == expected_result

    result = node.query(
        "select * from fileCluster('my_cluster', 'file*.csv', auto, auto) ORDER BY (c1, c2)"
    )
    assert result == expected_result

    result = node.query(
        "select * from fileCluster('my_cluster', 'file*.csv', CSV, auto) ORDER BY (c1, c2)"
    )
    assert result == expected_result

    result = node.query(
        "select * from fileCluster('my_cluster', 'file*.csv', auto, auto, auto) ORDER BY (c1, c2)"
    )
    assert result == expected_result

    result = node.query(
        "select * from fileCluster('my_cluster', 'file*.csv', CSV, auto, auto) ORDER BY (c1, c2)"
    )
    assert result == expected_result


def test_format_detection(started_cluster):
    for node_name in ("s0_0_0", "s0_0_1", "s0_1_0"):
        for i in range(1, 3):
            started_cluster.instances[node_name].query(
                f"""
                INSERT INTO TABLE FUNCTION file(
                    'file_for_format_detection_{i}', 'CSV', 's String, i UInt32') VALUES ('file{i}',{i})
                    """
            )

    node = started_cluster.instances["s0_0_0"]
    expected_result = node.query(
        "select * from file('file_for_format_detection*', 'CSV', 's String, i UInt32') ORDER BY (i, s)"
    )

    result = node.query(
        "select * from fileCluster('my_cluster', 'file_for_format_detection*') ORDER BY (c1, c2)"
    )
    assert result == expected_result

    result = node.query(
        "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto) ORDER BY (c1, c2)"
    )
    assert result == expected_result

    result = node.query(
        "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, auto) ORDER BY (c1, c2)"
    )
    assert result == expected_result

    result = node.query(
        "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, 's String, i UInt32') ORDER BY (i, s)"
    )
    assert result == expected_result

    result = node.query(
        "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, auto, auto) ORDER BY (c1, c2)"
    )
    assert result == expected_result

    result = node.query(
        "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, 's String, i UInt32', auto) ORDER BY (i, s)"
    )
    assert result == expected_result
Introduced fileCluster table function Added fileCluster function Added test and docs 2023-11-16 10:25:28 +00:00			`import logging`
			`import csv`
			`import time`

			`import pytest`
			`from helpers.cluster import ClickHouseCluster`
			`from helpers.test_tools import TSV`


			`logging.getLogger().setLevel(logging.INFO)`
			`logging.getLogger().addHandler(logging.StreamHandler())`


			`@pytest.fixture(scope="module")`
			`def started_cluster():`
			`try:`
			`cluster = ClickHouseCluster(__file__)`
			`cluster.add_instance(`
			`"s0_0_0",`
			`main_configs=["configs/cluster.xml"],`
			`user_configs=["configs/users.xml"],`
			`macros={"replica": "node1", "shard": "shard1"},`
			`with_zookeeper=True,`
			`)`
			`cluster.add_instance(`
			`"s0_0_1",`
			`main_configs=["configs/cluster.xml"],`
			`user_configs=["configs/users.xml"],`
			`macros={"replica": "replica2", "shard": "shard1"},`
			`with_zookeeper=True,`
			`)`
			`cluster.add_instance(`
			`"s0_1_0",`
			`main_configs=["configs/cluster.xml"],`
			`user_configs=["configs/users.xml"],`
			`macros={"replica": "replica1", "shard": "shard2"},`
			`with_zookeeper=True,`
			`)`

			`logging.info("Starting cluster...")`
			`cluster.start()`
			`logging.info("Cluster started")`

			`for node_name in ("s0_0_0", "s0_0_1", "s0_1_0"):`
			`for i in range(1, 3):`
			`cluster.instances[node_name].query(`
			`f"""`
			`INSERT INTO TABLE FUNCTION file(`
			`'file{i}.csv', 'CSV', 's String, i UInt32') VALUES ('file{i}',{i})`
			`"""`
			`)`

			`yield cluster`
			`finally:`
			`cluster.shutdown()`


			`def get_query(select: str, cluster: bool, files_nums: str, order_by="ORDER BY (i, s)"):`
			`if cluster:`
			`return f"SELECT {select} from fileCluster('my_cluster', 'file{{{files_nums}}}.csv', 'CSV', 's String, i UInt32') {order_by}"`
			`else:`
			`return f"SELECT {select} from file('file{{{files_nums}}}.csv', 'CSV', 's String, i UInt32') {order_by}"`


			`def test_select_all(started_cluster):`
			`node = started_cluster.instances["s0_0_0"]`

			`local = node.query(get_query("*", False, "1,2"))`
			`distributed = node.query(get_query("*", True, "1,2"))`

			`assert TSV(local) == TSV(distributed)`


			`def test_count(started_cluster):`
			`node = started_cluster.instances["s0_0_0"]`

			`local = node.query(get_query("count(*)", False, "1,2", ""))`
			`distributed = node.query(get_query("count(*)", True, "1,2", ""))`

			`assert TSV(local) == TSV(distributed)`


			`def test_non_existent_cluster(started_cluster):`
			`node = started_cluster.instances["s0_0_0"]`
			`error = node.query_and_get_error(`
			`"""`
			`SELECT count(*) from fileCluster(`
			`'non_existent_cluster', 'file{1,2}.csv', 'CSV', 's String, i UInt32')`
			`UNION ALL`
			`SELECT count(*) from fileCluster(`
			`'non_existent_cluster', 'file{1,2}.csv', 'CSV', 's String, i UInt32')`
			`"""`
			`)`

			`assert "not found" in error`


			`def test_missing_file(started_cluster):`
			`"""`
			`Select from a list of files, _some_ of them don't exist`
			`"""`
			`node = started_cluster.instances["s0_0_0"]`

			`local_with_missing_file = node.query(get_query("*", False, "1,2,3"))`
			`local_wo_missing_file = node.query(get_query("*", False, "1,2"))`

			`distributed_with_missing_file = node.query(get_query("*", True, "1,2,3"))`
			`distributed_wo_missing_file = node.query(get_query("*", True, "1,2"))`

			`assert TSV(local_with_missing_file) == TSV(distributed_with_missing_file)`
			`assert TSV(local_wo_missing_file) == TSV(distributed_wo_missing_file)`
			`assert TSV(local_with_missing_file) == TSV(distributed_wo_missing_file)`
			`assert TSV(local_wo_missing_file) == TSV(distributed_with_missing_file)`


			`def test_no_such_files(started_cluster):`
			`"""`
			`Select from a list of files, _none_ of them don't exist`
			`"""`
			`node = started_cluster.instances["s0_0_0"]`

			`local = node.query(get_query("*", False, "3,4"))`
			`distributed = node.query(get_query("*", True, "3,4"))`

			`assert TSV(local) == TSV(distributed)`
Try to detect file format automatically during schema inference if it's unknown 2024-01-22 22:55:50 +00:00

			`def test_schema_inference(started_cluster):`
			`node = started_cluster.instances["s0_0_0"]`

			`expected_result = node.query(`
			`"select * from file('file*.csv', 'CSV', 's String, i UInt32') ORDER BY (i, s)"`
			`)`
			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file*.csv') ORDER BY (c1, c2)"`
			`)`
			`assert result == expected_result`

			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file*.csv', auto) ORDER BY (c1, c2)"`
			`)`
			`assert result == expected_result`

			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file*.csv', CSV) ORDER BY (c1, c2)"`
			`)`
			`assert result == expected_result`

			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file*.csv', auto, auto) ORDER BY (c1, c2)"`
			`)`
			`assert result == expected_result`

			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file*.csv', CSV, auto) ORDER BY (c1, c2)"`
			`)`
			`assert result == expected_result`

			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file*.csv', auto, auto, auto) ORDER BY (c1, c2)"`
			`)`
			`assert result == expected_result`

			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file*.csv', CSV, auto, auto) ORDER BY (c1, c2)"`
			`)`
			`assert result == expected_result`


			`def test_format_detection(started_cluster):`
			`for node_name in ("s0_0_0", "s0_0_1", "s0_1_0"):`
			`for i in range(1, 3):`
			`started_cluster.instances[node_name].query(`
			`f"""`
			`INSERT INTO TABLE FUNCTION file(`
			`'file_for_format_detection_{i}', 'CSV', 's String, i UInt32') VALUES ('file{i}',{i})`
			`"""`
			`)`

			`node = started_cluster.instances["s0_0_0"]`
			`expected_result = node.query(`
			`"select * from file('file_for_format_detection*', 'CSV', 's String, i UInt32') ORDER BY (i, s)"`
			`)`

			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file_for_format_detection*') ORDER BY (c1, c2)"`
			`)`
			`assert result == expected_result`

			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file_for_format_detection*', auto) ORDER BY (c1, c2)"`
			`)`
			`assert result == expected_result`

			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, auto) ORDER BY (c1, c2)"`
			`)`
			`assert result == expected_result`

			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, 's String, i UInt32') ORDER BY (i, s)"`
			`)`
			`assert result == expected_result`

			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, auto, auto) ORDER BY (c1, c2)"`
			`)`
			`assert result == expected_result`

			`result = node.query(`
			`"select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, 's String, i UInt32', auto) ORDER BY (i, s)"`
			`)`
			`assert result == expected_result`