ClickHouse/tests/integration/test_distributed_format/test.py

# pylint: disable=redefined-outer-name
# pylint: disable=unused-argument
# pylint: disable=line-too-long

import pytest

from helpers.cluster import ClickHouseCluster

# One ClickHouse instance ("node") shared by every test in this module.
cluster = ClickHouseCluster(__file__)
node = cluster.add_instance(
    "node",
    # Two remote_servers definitions are passed as server configs; the tests
    # below refer to clusters by name (test_cluster_internal_replication,
    # test_cluster_no_internal_replication, test_cluster_remove_replica1).
    # NOTE(review): which file defines which cluster is not visible here.
    main_configs=["configs/remote_servers.xml", "configs/another_remote_servers.xml"],
    # NOTE(review): presumably keeps the container usable while
    # test_remove_replica edits the config in place - confirm in helpers.cluster.
    stay_alive=True,
)

# Decorator that runs a test once per cluster name. The value arrives in the
# test as the ``cluster`` argument (a plain string), which shadows the
# module-level ``cluster`` object inside the decorated tests.
cluster_param = pytest.mark.parametrize(
    "cluster",
    ["test_cluster_internal_replication", "test_cluster_no_internal_replication"],
)


def get_dist_path(cluster, table, dist_format):
    """Return the directory the tests read queued ``.bin`` files from.

    The result is always a sub-directory of ``/var/lib/clickhouse/data/test/<table>``:

    * ``dist_format == 0`` (tests insert with
      ``use_compact_format_in_distributed_parts_names=0``):
      ``default@not_existing:9000``, regardless of ``cluster``.
    * otherwise, for ``test_cluster_internal_replication``: ``shard1_all_replicas``.
    * otherwise, for any other cluster name: ``shard1_replica1``.
    """
    table_dir = f"/var/lib/clickhouse/data/test/{table}"

    if dist_format == 0:
        subdir = "default@not_existing:9000"
    elif cluster == "test_cluster_internal_replication":
        subdir = "shard1_all_replicas"
    else:
        subdir = "shard1_replica1"

    return f"{table_dir}/{subdir}"


@pytest.fixture(scope="module")
def started_cluster():
    """Start the cluster once per module and create the ``test`` database.

    Yields the module-level ``cluster`` object. ``cluster.shutdown()`` sits in
    ``finally``, so it runs whether start-up, database creation or a test
    using the fixture fails.
    """
    try:
        cluster.start()
        # Every table used by the tests in this module lives in database `test`.
        node.query("create database test")
        yield cluster

    finally:
        cluster.shutdown()


@cluster_param
def test_single_file(started_cluster, cluster):
    """Read one queued Distributed insert file back with ``clickhouse local``.

    Creates ``test.distr_1`` on the parametrized ``cluster``, inserts three rows
    with ``use_compact_format_in_distributed_parts_names=1`` and expects the
    insert to leave ``1.bin`` in the directory returned by ``get_dist_path``.
    The file is then read twice, through the ``file()`` table function and
    through a ``File`` engine table, both with the ``Distributed`` format.

    The table is dropped in ``finally``: without that, a failed assertion would
    leave ``test.distr_1`` behind and the next parametrized run would fail at
    ``create table`` instead of reporting its own result.
    """
    path = get_dist_path(cluster, "distr_1", 1)
    select_query = f"select * from file('{path}/1.bin', 'Distributed')"
    table_query = f"""
    create table t (x UInt64, s String) engine = File('Distributed', '{path}/1.bin');
    select * from t;
    """
    expected = "1\ta\n2\tbb\n3\tccc\n"

    # The remote database/table arguments are the literal identifiers
    # `database` and `table`; no such table is created anywhere in this module.
    node.query(
        f"create table test.distr_1 (x UInt64, s String) engine = Distributed('{cluster}', database, table)"
    )
    try:
        node.query(
            "insert into test.distr_1 values (1, 'a'), (2, 'bb'), (3, 'ccc')",
            settings={"use_compact_format_in_distributed_parts_names": "1"},
        )

        for query in (select_query, table_query):
            out = node.exec_in_container(
                ["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]
            )
            assert out == expected, f"unexpected output for query: {query}"
    finally:
        # Always clean up so the other parametrized run starts from a clean state.
        node.query("drop table test.distr_1")


@cluster_param
def test_two_files(started_cluster, cluster):
    """Read several queued Distributed insert files back through a glob.

    Creates ``test.distr_2`` on the parametrized ``cluster`` and performs two
    inserts with ``use_compact_format_in_distributed_parts_names=1``. The
    queued files are then read with the glob ``{1,2,3,4}.bin``, once through
    the ``file()`` table function and once through a ``File`` engine table.
    The ordered result must contain exactly the four inserted rows.

    The table is dropped in ``finally``: without that, a failed assertion would
    leave ``test.distr_2`` behind and the next parametrized run would fail at
    ``create table`` instead of reporting its own result.
    """
    path = get_dist_path(cluster, "distr_2", 1)
    # `{{` / `}}` are f-string escapes: the query receives the glob {1,2,3,4}.bin.
    select_query = f"select * from file('{path}/{{1,2,3,4}}.bin', 'Distributed') order by x"
    table_query = f"""
    create table t (x UInt64, s String) engine = File('Distributed', '{path}/{{1,2,3,4}}.bin');
    select * from t order by x;
    """
    expected = "0\t_\n1\ta\n2\tbb\n3\tccc\n"
    inserts = (
        "insert into test.distr_2 values (0, '_'), (1, 'a')",
        "insert into test.distr_2 values (2, 'bb'), (3, 'ccc')",
    )

    # The remote database/table arguments are the literal identifiers
    # `database` and `table`; no such table is created anywhere in this module.
    node.query(
        f"create table test.distr_2 (x UInt64, s String) engine = Distributed('{cluster}', database, table)"
    )
    try:
        for insert in inserts:
            node.query(
                insert,
                settings={
                    "use_compact_format_in_distributed_parts_names": "1",
                },
            )

        for query in (select_query, table_query):
            out = node.exec_in_container(
                ["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]
            )
            assert out == expected, f"unexpected output for query: {query}"
    finally:
        # Always clean up so the other parametrized run starts from a clean state.
        node.query("drop table test.distr_2")


@cluster_param
def test_single_file_old(started_cluster, cluster):
    """Same check as ``test_single_file`` but with the non-compact directory name.

    Inserts three rows into ``test.distr_3`` with
    ``use_compact_format_in_distributed_parts_names=0`` and reads ``1.bin``
    from the ``dist_format == 0`` directory returned by ``get_dist_path``
    (``default@not_existing:9000``). The file is read through the ``file()``
    table function and through a ``File`` engine table.

    The table is dropped in ``finally``: without that, a failed assertion would
    leave ``test.distr_3`` behind and the next parametrized run would fail at
    ``create table`` instead of reporting its own result.
    """
    path = get_dist_path(cluster, "distr_3", 0)
    select_query = f"select * from file('{path}/1.bin', 'Distributed')"
    table_query = f"""
    create table t (x UInt64, s String) engine = File('Distributed', '{path}/1.bin');
    select * from t;
    """
    expected = "1\ta\n2\tbb\n3\tccc\n"

    # The remote database/table arguments are the literal identifiers
    # `database` and `table`; no such table is created anywhere in this module.
    node.query(
        f"create table test.distr_3 (x UInt64, s String) engine = Distributed('{cluster}', database, table)"
    )
    try:
        node.query(
            "insert into test.distr_3 values (1, 'a'), (2, 'bb'), (3, 'ccc')",
            settings={
                "use_compact_format_in_distributed_parts_names": "0",
            },
        )

        for query in (select_query, table_query):
            out = node.exec_in_container(
                ["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]
            )
            assert out == expected, f"unexpected output for query: {query}"
    finally:
        # Always clean up so the other parametrized run starts from a clean state.
        node.query("drop table test.distr_3")


def test_remove_replica(started_cluster):
    """Check the server still answers after a cluster definition is swapped.

    Steps, in order:

    1. Create ``test.local_4`` and a Distributed table ``test.distr_4`` over
       cluster ``test_cluster_remove_replica1``, insert four rows, then detach
       ``test.distr_4``.
    2. Rewrite ``another_remote_servers.xml`` in the container with ``sed`` so
       that the old ``test_cluster_remove_replica1`` is renamed out of the way
       and ``test_cluster_remove_replica2`` takes over its name.
       NOTE(review): presumably the second definition has fewer replicas -
       confirm against the config file.
    3. Reload the config, then attach and flush the table. Errors from these
       two statements are ignored on purpose.
    4. Assert that ``select 1`` still returns ``1``.
    """
    config_path = "/etc/clickhouse-server/config.d/another_remote_servers.xml"

    setup_statements = (
        "create table test.local_4 (x UInt64, s String) engine = MergeTree order by x",
        "create table test.distr_4 (x UInt64, s String) engine = Distributed('test_cluster_remove_replica1', test, local_4)",
        "insert into test.distr_4 values (1, 'a'), (2, 'bb'), (3, 'ccc'), (4, 'dddd')",
        "detach table test.distr_4",
    )
    for statement in setup_statements:
        node.query(statement)

    # Order matters: the original name must be moved aside first, otherwise the
    # second substitution would produce two clusters with the same name.
    sed_expressions = (
        "s/test_cluster_remove_replica1/test_cluster_remove_replica_tmp/g",
        "s/test_cluster_remove_replica2/test_cluster_remove_replica1/g",
    )
    for sed_expression in sed_expressions:
        node.exec_in_container(["sed", "-i", sed_expression, config_path])

    node.query("SYSTEM RELOAD CONFIG")

    # Best effort: only the liveness check below decides the test result.
    tolerated_statements = (
        "attach table test.distr_4",
        "SYSTEM FLUSH DISTRIBUTED test.distr_4",
    )
    for statement in tolerated_statements:
        node.query(statement, ignore_error=True)

    liveness = node.query("select 1")
    assert liveness == "1\n"
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`# pylint: disable=redefined-outer-name`
			`# pylint: disable=unused-argument`
			`# pylint: disable=line-too-long`
Added tests 2020-01-05 19:36:14 +00:00
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`import pytest`
Automatic style fix 2024-09-27 10:19:39 +00:00
Added tests 2020-01-05 19:36:14 +00:00			`from helpers.cluster import ClickHouseCluster`

			`cluster = ClickHouseCluster(__file__)`
Black 2022-06-13 19:44:12 +00:00			`node = cluster.add_instance(`
			`"node",`
			`main_configs=["configs/remote_servers.xml", "configs/another_remote_servers.xml"],`
			`stay_alive=True,`
			`)`
Added tests 2020-01-05 19:36:14 +00:00
Update test_distributed_format to cover remote_servers via <node> 2020-06-08 19:15:30 +00:00			`cluster_param = pytest.mark.parametrize(`
			`"cluster",`
			`[`
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`("test_cluster_internal_replication"),`
			`("test_cluster_no_internal_replication"),`
Update test_distributed_format to cover remote_servers via <node> 2020-06-08 19:15:30 +00:00			`],`
			`)`

Added tests 2020-01-05 19:36:14 +00:00
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`def get_dist_path(cluster, table, dist_format):`
			`if dist_format == 0:`
			`return f"/var/lib/clickhouse/data/test/{table}/default@not_existing:9000"`
			`if cluster == "test_cluster_internal_replication":`
			`return f"/var/lib/clickhouse/data/test/{table}/shard1_all_replicas"`
			`return f"/var/lib/clickhouse/data/test/{table}/shard1_replica1"`


Added tests 2020-01-05 19:36:14 +00:00			`@pytest.fixture(scope="module")`
			`def started_cluster():`
			`try:`
			`cluster.start()`
enable more tests 2020-09-22 11:56:40 +00:00			`node.query("create database test")`
Added tests 2020-01-05 19:36:14 +00:00			`yield cluster`

			`finally:`
			`cluster.shutdown()`

Format and optimize imports in integration test files This PR formats all the `*.py` files found under the `tests/integration` folder. It also reorders the imports and cleans up a bunch of unused imports. The formatting also takes care of other things like wrapping lines and fixing spaces and indents such that the tests look more readable. 2020-09-16 04:26:10 +00:00
Update test_distributed_format to cover remote_servers via <node> 2020-06-08 19:15:30 +00:00			`@cluster_param`
			`def test_single_file(started_cluster, cluster):`
Format and optimize imports in integration test files This PR formats all the `*.py` files found under the `tests/integration` folder. It also reorders the imports and cleans up a bunch of unused imports. The formatting also takes care of other things like wrapping lines and fixing spaces and indents such that the tests look more readable. 2020-09-16 04:26:10 +00:00			`node.query(`
			`"create table test.distr_1 (x UInt64, s String) engine = Distributed('{}', database, table)".format(`
			`cluster`
			`)`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`)`
Format and optimize imports in integration test files This PR formats all the `*.py` files found under the `tests/integration` folder. It also reorders the imports and cleans up a bunch of unused imports. The formatting also takes care of other things like wrapping lines and fixing spaces and indents such that the tests look more readable. 2020-09-16 04:26:10 +00:00			`node.query(`
			`"insert into test.distr_1 values (1, 'a'), (2, 'bb'), (3, 'ccc')",`
			`settings={"use_compact_format_in_distributed_parts_names": "1"},`
			`)`
Added tests 2020-01-05 19:36:14 +00:00
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`path = get_dist_path(cluster, "distr_1", 1)`
			`query = f"select * from file('{path}/1.bin', 'Distributed')"`
Added tests 2020-01-05 19:36:14 +00:00			`out = node.exec_in_container(`
Remove obsolete --multiquery parameter from tests 2024-07-29 20:06:55 +00:00			`["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]`
Added tests 2020-01-05 19:36:14 +00:00			`)`

			`assert out == "1\ta\n2\tbb\n3\tccc\n"`

Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`query = f"""`
			`create table t (x UInt64, s String) engine = File('Distributed', '{path}/1.bin');`
			`select * from t;`
			`"""`
Added tests 2020-01-05 19:36:14 +00:00			`out = node.exec_in_container(`
Remove obsolete --multiquery parameter from tests 2024-07-29 20:06:55 +00:00			`["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]`
Added tests 2020-01-05 19:36:14 +00:00			`)`

			`assert out == "1\ta\n2\tbb\n3\tccc\n"`

fix some tests 2020-01-28 19:39:52 +00:00			`node.query("drop table test.distr_1")`
Added tests 2020-01-05 19:36:14 +00:00

Update test_distributed_format to cover remote_servers via <node> 2020-06-08 19:15:30 +00:00			`@cluster_param`
			`def test_two_files(started_cluster, cluster):`
Format and optimize imports in integration test files This PR formats all the `*.py` files found under the `tests/integration` folder. It also reorders the imports and cleans up a bunch of unused imports. The formatting also takes care of other things like wrapping lines and fixing spaces and indents such that the tests look more readable. 2020-09-16 04:26:10 +00:00			`node.query(`
			`"create table test.distr_2 (x UInt64, s String) engine = Distributed('{}', database, table)".format(`
			`cluster`
			`)`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`)`
Set use_compact_format_in_distributed_parts_names=0 for some integration tests To avoid depends from default. 2020-11-06 05:34:37 +00:00			`node.query(`
			`"insert into test.distr_2 values (0, '_'), (1, 'a')",`
			`settings={`
			`"use_compact_format_in_distributed_parts_names": "1",`
			`},`
			`)`
			`node.query(`
			`"insert into test.distr_2 values (2, 'bb'), (3, 'ccc')",`
			`settings={`
			`"use_compact_format_in_distributed_parts_names": "1",`
			`},`
			`)`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`path = get_dist_path(cluster, "distr_2", 1)`
			`query = f"select * from file('{path}/{{1,2,3,4}}.bin', 'Distributed') order by x"`
Added tests 2020-01-05 19:36:14 +00:00			`out = node.exec_in_container(`
			`["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]`
			`)`

			`assert out == "0\t_\n1\ta\n2\tbb\n3\tccc\n"`

Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`query = f"""`
			`create table t (x UInt64, s String) engine = File('Distributed', '{path}/{{1,2,3,4}}.bin');`
			`select * from t order by x;`
			`"""`
Added tests 2020-01-05 19:36:14 +00:00			`out = node.exec_in_container(`
Remove obsolete --multiquery parameter from tests 2024-07-29 20:06:55 +00:00			`["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]`
Added tests 2020-01-05 19:36:14 +00:00			`)`

			`assert out == "0\t_\n1\ta\n2\tbb\n3\tccc\n"`

fix some tests 2020-01-28 19:39:52 +00:00			`node.query("drop table test.distr_2")`
Add setting for a new format of distributed parts 2020-03-13 18:49:46 +00:00

Update test_distributed_format to cover remote_servers via <node> 2020-06-08 19:15:30 +00:00			`@cluster_param`
			`def test_single_file_old(started_cluster, cluster):`
Format and optimize imports in integration test files This PR formats all the `*.py` files found under the `tests/integration` folder. It also reorders the imports and cleans up a bunch of unused imports. The formatting also takes care of other things like wrapping lines and fixing spaces and indents such that the tests look more readable. 2020-09-16 04:26:10 +00:00			`node.query(`
			`"create table test.distr_3 (x UInt64, s String) engine = Distributed('{}', database, table)".format(`
			`cluster`
			`)`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`)`
Set use_compact_format_in_distributed_parts_names=0 for some integration tests To avoid depends from default. 2020-11-06 05:34:37 +00:00			`node.query(`
			`"insert into test.distr_3 values (1, 'a'), (2, 'bb'), (3, 'ccc')",`
			`settings={`
			`"use_compact_format_in_distributed_parts_names": "0",`
			`},`
			`)`
Add setting for a new format of distributed parts 2020-03-13 18:49:46 +00:00
Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`path = get_dist_path(cluster, "distr_3", 0)`
			`query = f"select * from file('{path}/1.bin', 'Distributed')"`
Add setting for a new format of distributed parts 2020-03-13 18:49:46 +00:00			`out = node.exec_in_container(`
			`["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]`
			`)`

			`assert out == "1\ta\n2\tbb\n3\tccc\n"`

Drop replicas from dirname for internal_replication=true Under use_compact_format_in_distributed_parts_names=1 and internal_replication=true the server encodes all replicas for the directory name for async INSERT into Distributed, and the directory name looks like: shard1_replica1,shard1_replica2,shard3_replica3 This is required for creating connections (to specific replicas only), but in case of internal_replication=true, this can be avoided, since this path will always includes all replicas. This patch replaces all replicas with "_all_replicas" marker. Note, that initial problem was that this path may overflow the NAME_MAX if you will have more then 15 replicas, and the server will fail to create the directory. Also note, that changed directory name should not be a problem, since: - empty directories will be removed since #16729 - and replicas encoded in the directory name is also supported anyway. 2021-06-20 13:50:01 +00:00			`query = f"""`
			`create table t (x UInt64, s String) engine = File('Distributed', '{path}/1.bin');`
			`select * from t;`
			`"""`
Add setting for a new format of distributed parts 2020-03-13 18:49:46 +00:00			`out = node.exec_in_container(`
Remove obsolete --multiquery parameter from tests 2024-07-29 20:06:55 +00:00			`["/usr/bin/clickhouse", "local", "--stacktrace", "-q", query]`
Add setting for a new format of distributed parts 2020-03-13 18:49:46 +00:00			`)`

			`assert out == "1\ta\n2\tbb\n3\tccc\n"`

fixes 2020-03-18 17:38:52 +00:00			`node.query("drop table test.distr_3")`
Fix possible crash in Distributed async insert in case of removing a replica from config. 2022-06-13 15:21:43 +00:00
Black 2022-06-13 19:44:12 +00:00
Fix possible crash in Distributed async insert in case of removing a replica from config. 2022-06-13 15:21:43 +00:00			`def test_remove_replica(started_cluster):`
			`node.query(`
			`"create table test.local_4 (x UInt64, s String) engine = MergeTree order by x"`
			`)`
			`node.query(`
			`"create table test.distr_4 (x UInt64, s String) engine = Distributed('test_cluster_remove_replica1', test, local_4)"`
			`)`
Black 2022-06-13 19:44:12 +00:00			`node.query(`
			`"insert into test.distr_4 values (1, 'a'), (2, 'bb'), (3, 'ccc'), (4, 'dddd')"`
			`)`
Fix possible crash in Distributed async insert in case of removing a replica from config. 2022-06-13 15:21:43 +00:00			`node.query("detach table test.distr_4")`

Black 2022-06-13 19:44:12 +00:00			`node.exec_in_container(`
			`[`
			`"sed",`
			`"-i",`
			`"s/test_cluster_remove_replica1/test_cluster_remove_replica_tmp/g",`
			`"/etc/clickhouse-server/config.d/another_remote_servers.xml",`
			`]`
			`)`
			`node.exec_in_container(`
			`[`
			`"sed",`
			`"-i",`
			`"s/test_cluster_remove_replica2/test_cluster_remove_replica1/g",`
			`"/etc/clickhouse-server/config.d/another_remote_servers.xml",`
			`]`
			`)`
Fix possible crash in Distributed async insert in case of removing a replica from config. 2022-06-13 15:21:43 +00:00			`node.query("SYSTEM RELOAD CONFIG")`
			`node.query("attach table test.distr_4", ignore_error=True)`
			`node.query("SYSTEM FLUSH DISTRIBUTED test.distr_4", ignore_error=True)`
Black 2022-06-13 19:44:12 +00:00			`assert node.query("select 1") == "1\n"`