ClickHouse/tests/integration/test_distributed_format/test.py

# pylint: disable=redefined-outer-name
# pylint: disable=unused-argument
# pylint: disable=line-too-long

import pytest
from helpers.cluster import ClickHouseCluster

# One ClickHouse instance is enough: the tests only inspect files that the
# Distributed engine writes locally on this node.
cluster = ClickHouseCluster(__file__)
node = cluster.add_instance("node", main_configs=["configs/remote_servers.xml"])

# Decorator that runs a test once per cluster name; the name is passed to the
# test as its ``cluster`` argument.
# NOTE(review): both names are presumably defined in configs/remote_servers.xml — confirm.
cluster_param = pytest.mark.parametrize(
    "cluster",
    [
        "test_cluster_internal_replication",
        "test_cluster_no_internal_replication",
    ],
)


def get_dist_path(cluster, table, dist_format):
    """Return the directory that holds pending batch files of a Distributed table.

    Args:
        cluster: Cluster name the Distributed table was created with.
        table: Table name inside the ``test`` database.
        dist_format: ``0`` selects the old (non-compact) directory naming; any
            other value selects the compact naming.

    Returns:
        Absolute path (inside the container) of the per-destination directory.
    """
    table_dir = f"/var/lib/clickhouse/data/test/{table}"

    if dist_format == 0:
        # Old naming does not depend on the cluster.
        destination = "default@not_existing:9000"
    elif cluster == "test_cluster_internal_replication":
        # Compact naming for this cluster uses the "_all_replicas" marker.
        destination = "shard1_all_replicas"
    else:
        destination = "shard1_replica1"

    return f"{table_dir}/{destination}"


@pytest.fixture(scope="module")
def started_cluster():
    """Start the cluster once per module and create the ``test`` database.

    Yields:
        The module-level ``cluster`` object, after it has been started.
    """
    try:
        cluster.start()
        # Every test in this module creates its tables inside this database.
        node.query("create database test")
        yield cluster

    finally:
        # Runs on normal teardown and also when start-up or the query above raised.
        cluster.shutdown()


@cluster_param
def test_single_file(started_cluster, cluster):
    """Read a single compact-named Distributed batch file with clickhouse-local.

    After one insert into ``test.distr_1`` the file ``1.bin`` in the table's
    compact-format directory must be readable in the ``Distributed`` format,
    both through the ``file()`` table function and through the ``File`` engine.

    Args:
        started_cluster: Fixture that guarantees the cluster is running.
        cluster: Cluster name supplied by ``cluster_param``.
    """
    # get_dist_path only formats a string, so the queries can be prepared up front.
    path = get_dist_path(cluster, "distr_1", 1)
    expected = "1\ta\n2\tbb\n3\tccc\n"
    file_function_query = f"select * from file('{path}/1.bin', 'Distributed')"
    file_engine_query = f"""
    create table t (x UInt64, s String) engine = File('Distributed', '{path}/1.bin');
    select * from t;
    """

    node.query(
        f"create table test.distr_1 (x UInt64, s String) engine = Distributed('{cluster}', database, table)"
    )
    # Drop the table even when an assertion fails; otherwise the leftover
    # table makes the next parametrized run fail at "create table" and hides
    # the real failure.
    try:
        node.query(
            "insert into test.distr_1 values (1, 'a'), (2, 'bb'), (3, 'ccc')",
            settings={"use_compact_format_in_distributed_parts_names": "1"},
        )

        for query in (file_function_query, file_engine_query):
            out = node.exec_in_container(
                ["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]
            )
            assert out == expected
    finally:
        node.query("drop table test.distr_1")


@cluster_param
def test_two_files(started_cluster, cluster):
    """Read several compact-named Distributed batch files with one glob.

    Two separate inserts into ``test.distr_2`` are made, then the glob
    ``{1,2,3,4}.bin`` in the table's compact-format directory is read in the
    ``Distributed`` format through the ``file()`` table function and through
    the ``File`` engine. The rows of both inserts must come back, ordered by ``x``.

    Args:
        started_cluster: Fixture that guarantees the cluster is running.
        cluster: Cluster name supplied by ``cluster_param``.
    """
    # get_dist_path only formats a string, so the queries can be prepared up front.
    path = get_dist_path(cluster, "distr_2", 1)
    expected = "0\t_\n1\ta\n2\tbb\n3\tccc\n"
    file_function_query = (
        f"select * from file('{path}/{{1,2,3,4}}.bin', 'Distributed') order by x"
    )
    file_engine_query = f"""
    create table t (x UInt64, s String) engine = File('Distributed', '{path}/{{1,2,3,4}}.bin');
    select * from t order by x;
    """
    compact_names = {"use_compact_format_in_distributed_parts_names": "1"}

    node.query(
        f"create table test.distr_2 (x UInt64, s String) engine = Distributed('{cluster}', database, table)"
    )
    # Drop the table even when an assertion fails; otherwise the leftover
    # table makes the next parametrized run fail at "create table" and hides
    # the real failure.
    try:
        node.query(
            "insert into test.distr_2 values (0, '_'), (1, 'a')",
            settings=compact_names,
        )
        node.query(
            "insert into test.distr_2 values (2, 'bb'), (3, 'ccc')",
            settings=compact_names,
        )

        for query in (file_function_query, file_engine_query):
            out = node.exec_in_container(
                ["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]
            )
            assert out == expected
    finally:
        node.query("drop table test.distr_2")


@cluster_param
def test_single_file_old(started_cluster, cluster):
    """Read a single old-style (non-compact) Distributed batch file.

    Same check as ``test_single_file`` but with
    ``use_compact_format_in_distributed_parts_names`` set to ``0``, so ``1.bin``
    is looked up in the old-format directory returned by ``get_dist_path``.

    Args:
        started_cluster: Fixture that guarantees the cluster is running.
        cluster: Cluster name supplied by ``cluster_param``.
    """
    # get_dist_path only formats a string, so the queries can be prepared up front.
    path = get_dist_path(cluster, "distr_3", 0)
    expected = "1\ta\n2\tbb\n3\tccc\n"
    file_function_query = f"select * from file('{path}/1.bin', 'Distributed')"
    file_engine_query = f"""
    create table t (x UInt64, s String) engine = File('Distributed', '{path}/1.bin');
    select * from t;
    """

    node.query(
        f"create table test.distr_3 (x UInt64, s String) engine = Distributed('{cluster}', database, table)"
    )
    # Drop the table even when an assertion fails; otherwise the leftover
    # table makes the next parametrized run fail at "create table" and hides
    # the real failure.
    try:
        node.query(
            "insert into test.distr_3 values (1, 'a'), (2, 'bb'), (3, 'ccc')",
            settings={
                "use_compact_format_in_distributed_parts_names": "0",
            },
        )

        for query in (file_function_query, file_engine_query):
            out = node.exec_in_container(
                ["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]
            )
            assert out == expected
    finally:
        node.query("drop table test.distr_3")
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`# pylint: disable=redefined-outer-name`
			`# pylint: disable=unused-argument`
			`# pylint: disable=line-too-long`
Added tests 2020-01-05 19:36:14 +00:00
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`import pytest`
Added tests 2020-01-05 19:36:14 +00:00			`from helpers.cluster import ClickHouseCluster`

			`cluster = ClickHouseCluster(__file__)`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`node = cluster.add_instance("node", main_configs=["configs/remote_servers.xml"])`
Added tests 2020-01-05 19:36:14 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`cluster_param = pytest.mark.parametrize(`
			`"cluster",`
			`[`
			`("test_cluster_internal_replication"),`
			`("test_cluster_no_internal_replication"),`
			`],`
			`)`
Update test_distributed_format to cover remote_servers via <node> 2020-06-08 19:15:30 +00:00
Added tests 2020-01-05 19:36:14 +00:00
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`def get_dist_path(cluster, table, dist_format):`
			`if dist_format == 0:`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`return f"/var/lib/clickhouse/data/test/{table}/default@not_existing:9000"`
			`if cluster == "test_cluster_internal_replication":`
			`return f"/var/lib/clickhouse/data/test/{table}/shard1_all_replicas"`
			`return f"/var/lib/clickhouse/data/test/{table}/shard1_replica1"`
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00

Added tests 2020-01-05 19:36:14 +00:00			`@pytest.fixture(scope="module")`
			`def started_cluster():`
			`try:`
			`cluster.start()`
enable more tests 2020-09-22 11:56:40 +00:00			`node.query("create database test")`
Added tests 2020-01-05 19:36:14 +00:00			`yield cluster`

			`finally:`
			`cluster.shutdown()`

Format and optimize imports in integration test files This PR formats all the `*.py` files found under the `tests/integration` folder. It also reorders the imports and cleans up a bunch of unused imports. The formatting also takes care of other things like wrapping lines and fixing spaces and indents such that the tests look more readable. 2020-09-16 04:26:10 +00:00
Update test_distributed_format to cover remote_servers via <node> 2020-06-08 19:15:30 +00:00			`@cluster_param`
			`def test_single_file(started_cluster, cluster):`
Format and optimize imports in integration test files This PR formats all the `*.py` files found under the `tests/integration` folder. It also reorders the imports and cleans up a bunch of unused imports. The formatting also takes care of other things like wrapping lines and fixing spaces and indents such that the tests look more readable. 2020-09-16 04:26:10 +00:00			`node.query(`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`"create table test.distr_1 (x UInt64, s String) engine = Distributed('{}', database, table)".format(`
			`cluster`
			`)`
			`)`
			`node.query(`
			`"insert into test.distr_1 values (1, 'a'), (2, 'bb'), (3, 'ccc')",`
			`settings={"use_compact_format_in_distributed_parts_names": "1"},`
			`)`
Added tests 2020-01-05 19:36:14 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`path = get_dist_path(cluster, "distr_1", 1)`
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`query = f"select * from file('{path}/1.bin', 'Distributed')"`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`out = node.exec_in_container(`
			`["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]`
			`)`
Added tests 2020-01-05 19:36:14 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`assert out == "1\ta\n2\tbb\n3\tccc\n"`
Added tests 2020-01-05 19:36:14 +00:00
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`query = f"""`
			`create table t (x UInt64, s String) engine = File('Distributed', '{path}/1.bin');`
			`select * from t;`
			`"""`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`out = node.exec_in_container(`
			`["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]`
			`)`
Added tests 2020-01-05 19:36:14 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`assert out == "1\ta\n2\tbb\n3\tccc\n"`
Added tests 2020-01-05 19:36:14 +00:00
fix some tests 2020-01-28 19:39:52 +00:00			`node.query("drop table test.distr_1")`
Added tests 2020-01-05 19:36:14 +00:00

Update test_distributed_format to cover remote_servers via <node> 2020-06-08 19:15:30 +00:00			`@cluster_param`
			`def test_two_files(started_cluster, cluster):`
Format and optimize imports in integration test files This PR formats all the `*.py` files found under the `tests/integration` folder. It also reorders the imports and cleans up a bunch of unused imports. The formatting also takes care of other things like wrapping lines and fixing spaces and indents such that the tests look more readable. 2020-09-16 04:26:10 +00:00			`node.query(`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`"create table test.distr_2 (x UInt64, s String) engine = Distributed('{}', database, table)".format(`
			`cluster`
			`)`
			`)`
			`node.query(`
			`"insert into test.distr_2 values (0, '_'), (1, 'a')",`
			`settings={`
			`"use_compact_format_in_distributed_parts_names": "1",`
			`},`
			`)`
			`node.query(`
			`"insert into test.distr_2 values (2, 'bb'), (3, 'ccc')",`
			`settings={`
			`"use_compact_format_in_distributed_parts_names": "1",`
			`},`
			`)`

			`path = get_dist_path(cluster, "distr_2", 1)`
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`query = f"select * from file('{path}/{{1,2,3,4}}.bin', 'Distributed') order by x"`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`out = node.exec_in_container(`
			`["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]`
			`)`
Added tests 2020-01-05 19:36:14 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`assert out == "0\t_\n1\ta\n2\tbb\n3\tccc\n"`
Added tests 2020-01-05 19:36:14 +00:00
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`query = f"""`
			`create table t (x UInt64, s String) engine = File('Distributed', '{path}/{{1,2,3,4}}.bin');`
			`select * from t order by x;`
			`"""`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`out = node.exec_in_container(`
			`["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]`
			`)`
Added tests 2020-01-05 19:36:14 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`assert out == "0\t_\n1\ta\n2\tbb\n3\tccc\n"`
Added tests 2020-01-05 19:36:14 +00:00
fix some tests 2020-01-28 19:39:52 +00:00			`node.query("drop table test.distr_2")`
Add setting for a new format of distributed parts 2020-03-13 18:49:46 +00:00

Update test_distributed_format to cover remote_servers via <node> 2020-06-08 19:15:30 +00:00			`@cluster_param`
			`def test_single_file_old(started_cluster, cluster):`
Format and optimize imports in integration test files This PR formats all the `*.py` files found under the `tests/integration` folder. It also reorders the imports and cleans up a bunch of unused imports. The formatting also takes care of other things like wrapping lines and fixing spaces and indents such that the tests look more readable. 2020-09-16 04:26:10 +00:00			`node.query(`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`"create table test.distr_3 (x UInt64, s String) engine = Distributed('{}', database, table)".format(`
			`cluster`
			`)`
			`)`
			`node.query(`
			`"insert into test.distr_3 values (1, 'a'), (2, 'bb'), (3, 'ccc')",`
			`settings={`
			`"use_compact_format_in_distributed_parts_names": "0",`
			`},`
			`)`
Add setting for a new format of distributed parts 2020-03-13 18:49:46 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`path = get_dist_path(cluster, "distr_3", 0)`
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`query = f"select * from file('{path}/1.bin', 'Distributed')"`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`out = node.exec_in_container(`
			`["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]`
			`)`
Add setting for a new format of distributed parts 2020-03-13 18:49:46 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`assert out == "1\ta\n2\tbb\n3\tccc\n"`
Add setting for a new format of distributed parts 2020-03-13 18:49:46 +00:00
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`query = f"""`
			`create table t (x UInt64, s String) engine = File('Distributed', '{path}/1.bin');`
			`select * from t;`
			`"""`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`out = node.exec_in_container(`
			`["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]`
			`)`
Add setting for a new format of distributed parts 2020-03-13 18:49:46 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`assert out == "1\ta\n2\tbb\n3\tccc\n"`
Add setting for a new format of distributed parts 2020-03-13 18:49:46 +00:00
fixes 2020-03-18 17:38:52 +00:00			`node.query("drop table test.distr_3")`