Add a test to check max_rows_to_read_leaf behaviour

In ClickHouse v22.3, querying a distributed table through a view
raises a max_rows_to_read_leaf "limit reached" exception on the root
executor even though the row limit is not actually reached on any
leaf node.

This is fixed in ClickHouse v23.3, but this integration test is added
to catch any regressions in the future.
Sean Haynes 2023-04-20 08:32:02 +00:00
parent ee9fae6aa2
commit 1d51bc90f0
3 changed files with 94 additions and 0 deletions
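A minimal sketch of the failure mode, assuming the node instances and the test_view created in test.py below; reproduce() is a hypothetical helper for illustration, not part of the commit:

from helpers.client import QueryRuntimeException


def reproduce(node):
    # Each of the two shards holds 200 rows, so no single leaf exceeds
    # the limit; on v22.3 the root executor nevertheless counts the 400
    # merged rows against max_rows_to_read_leaf when reading via a view.
    try:
        result = node.query(
            "SELECT count() FROM test_view SETTINGS max_rows_to_read_leaf=200"
        )
        print("fixed behaviour (v23.3):", result.strip())  # prints 400
    except QueryRuntimeException as e:
        print("buggy behaviour (v22.3):", e)  # limit reached on root executor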

configs/remote_servers.xml

@@ -0,0 +1,18 @@
<clickhouse>
    <remote_servers>
        <two_shards>
            <shard>
                <replica>
                    <host>node1</host>
                    <port>9000</port>
                </replica>
            </shard>
            <shard>
                <replica>
                    <host>node2</host>
                    <port>9000</port>
                </replica>
            </shard>
        </two_shards>
    </remote_servers>
</clickhouse>
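For reference, the two_shards cluster defined above is what the test's Distributed table points at; an annotated copy of the DDL from the fixture below (the inline comments are added here, not part of the commit):

DISTRIBUTED_DDL = """
CREATE TABLE distributed_table (id UInt32, d DateTime)
ENGINE = Distributed(
    two_shards,   -- cluster name from <remote_servers> above
    default,      -- database that holds the shard-local table
    local_table   -- table read on each of node1 and node2
)
"""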

test.py

@@ -0,0 +1,76 @@
from contextlib import contextmanager

import pytest

from helpers.cluster import ClickHouseCluster
from helpers.client import QueryRuntimeException

cluster = ClickHouseCluster(__file__)

node1 = cluster.add_instance(
    "node1",
    main_configs=["configs/remote_servers.xml"],
    with_zookeeper=True,
)
node2 = cluster.add_instance(
    "node2",
    main_configs=["configs/remote_servers.xml"],
    with_zookeeper=True,
)


@pytest.fixture(scope="module")
def started_cluster():
    try:
        cluster.start()
        for node in (node1, node2):
            # node.name gives each replica a unique, stable name in the
            # ReplicatedMergeTree ZooKeeper path
            node.query(
                f"""
                CREATE TABLE local_table(id UInt32, d DateTime) ENGINE = ReplicatedMergeTree('/clickhouse/tables/0/max_rows_read_leaf', '{node.name}') PARTITION BY toYYYYMM(d) ORDER BY d;
                CREATE TABLE distributed_table(id UInt32, d DateTime) ENGINE = Distributed(two_shards, default, local_table);
                CREATE OR REPLACE VIEW test_view AS select id from distributed_table;
                """
            )
        node1.query(
            "INSERT INTO local_table (id) select * from system.numbers limit 200"
        )
        node2.query(
            "INSERT INTO local_table (id) select * from system.numbers limit 200"
        )
        yield cluster
    finally:
        cluster.shutdown()


def test_max_rows_to_read_leaf_via_view(started_cluster):
    """
    Asserts the expected behaviour that we should be able to select
    the total number of rows (400, i.e. 200 from each shard) from a
    view that selects from a distributed table.
    """
    assert (
        node1.query(
            "SELECT count() from test_view SETTINGS max_rows_to_read_leaf=200"
        ).rstrip()
        == "400"
    )
    with pytest.raises(
        QueryRuntimeException, match="controlled by 'max_rows_to_read_leaf'"
    ):
        # insert some more data so the 200-row limit is legitimately
        # exceeded on the leaf, then expect the SELECT to fail
        node2.query(
            "INSERT INTO local_table (id) select * from system.numbers limit 10"
        )
        node2.query("SELECT count() from test_view SETTINGS max_rows_to_read_leaf=200")


if __name__ == "__main__":
    with contextmanager(started_cluster)() as cluster:
        for name, instance in list(cluster.instances.items()):
            print(name, instance.ip_address)
        input("Cluster created, press any key to destroy...")