ClickHouse/tests/integration/test_replicated_fetches_timeouts/test.py

#!/usr/bin/env python3

import random
import string
import time

import pytest
from helpers.cluster import ClickHouseCluster
from helpers.network import PartitionManager

cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance(
    'node1', with_zookeeper=True,
    main_configs=['configs/server.xml'])

node2 = cluster.add_instance(
    'node2', with_zookeeper=True,
    main_configs=['configs/server.xml'])


@pytest.fixture(scope="module")
def started_cluster():
    try:
        cluster.start()

        yield cluster

    finally:
        cluster.shutdown()


def get_random_string(length):
    return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))


def test_no_stall(started_cluster):
    for instance in started_cluster.instances.values():
        instance.query("""
            CREATE TABLE t (key UInt64, data String)
            ENGINE = ReplicatedMergeTree('/clickhouse/test/t', '{instance}')
                ORDER BY tuple()
                PARTITION BY key""")

    # Pause node3 until the test setup is prepared
    node2.query("SYSTEM STOP FETCHES t")

    node1.query("INSERT INTO t SELECT 1, '{}' FROM numbers(500)".format(get_random_string(104857)))
    node1.query("INSERT INTO t SELECT 2, '{}' FROM numbers(500)".format(get_random_string(104857)))

    with PartitionManager() as pm:
        pm.add_network_delay(node1, 2000)
        node2.query("SYSTEM START FETCHES t")

        # Wait for timeout exceptions to confirm that timeout is triggered.
        while True:
            conn_timeout_exceptions = int(node2.query(
                """
                SELECT count()
                FROM system.replication_queue
                WHERE last_exception LIKE '%connect timed out%'
                """))

            if conn_timeout_exceptions >= 2:
                break

            time.sleep(0.1)

        print("Connection timeouts tested!")

        # Increase connection timeout and wait for receive timeouts.
        node2.query("""
            ALTER TABLE t
                MODIFY SETTING replicated_fetches_http_connection_timeout = 30,
                    replicated_fetches_http_receive_timeout = 1""")

        while True:
            timeout_exceptions = int(node2.query(
                """
                SELECT count()
                FROM system.replication_queue
                WHERE last_exception LIKE '%e.displayText() = Timeout%'
                    AND last_exception NOT LIKE '%connect timed out%'
                """).strip())

            if timeout_exceptions >= 2:
                break

            time.sleep(0.1)

    for instance in started_cluster.instances.values():
        # Workaround for DROP TABLE not finishing if it is started while table is readonly.
        instance.query("SYSTEM RESTART REPLICA t")

        # Cleanup data directory from test results archive.
        instance.query("DROP TABLE t SYNC")
Improve replicated fetches timeouts test and make it 3x faster 2021-02-08 21:19:32 +00:00			`#!/usr/bin/env python3`

			`import random`
			`import string`
			`import time`

			`import pytest`
			`from helpers.cluster import ClickHouseCluster`
			`from helpers.network import PartitionManager`

			`cluster = ClickHouseCluster(__file__)`
			`node1 = cluster.add_instance(`
			`'node1', with_zookeeper=True,`
			`main_configs=['configs/server.xml'])`

			`node2 = cluster.add_instance(`
			`'node2', with_zookeeper=True,`
			`main_configs=['configs/server.xml'])`


			`@pytest.fixture(scope="module")`
			`def started_cluster():`
			`try:`
			`cluster.start()`

			`yield cluster`

			`finally:`
			`cluster.shutdown()`


			`def get_random_string(length):`
			`return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))`


			`def test_no_stall(started_cluster):`
			`for instance in started_cluster.instances.values():`
			`instance.query("""`
			`CREATE TABLE t (key UInt64, data String)`
			`ENGINE = ReplicatedMergeTree('/clickhouse/test/t', '{instance}')`
			`ORDER BY tuple()`
			`PARTITION BY key""")`

			`# Pause node3 until the test setup is prepared`
			`node2.query("SYSTEM STOP FETCHES t")`

			`node1.query("INSERT INTO t SELECT 1, '{}' FROM numbers(500)".format(get_random_string(104857)))`
			`node1.query("INSERT INTO t SELECT 2, '{}' FROM numbers(500)".format(get_random_string(104857)))`

			`with PartitionManager() as pm:`
			`pm.add_network_delay(node1, 2000)`
			`node2.query("SYSTEM START FETCHES t")`

			`# Wait for timeout exceptions to confirm that timeout is triggered.`
			`while True:`
			`conn_timeout_exceptions = int(node2.query(`
			`"""`
			`SELECT count()`
			`FROM system.replication_queue`
			`WHERE last_exception LIKE '%connect timed out%'`
			`"""))`

			`if conn_timeout_exceptions >= 2:`
			`break`

			`time.sleep(0.1)`

			`print("Connection timeouts tested!")`

			`# Increase connection timeout and wait for receive timeouts.`
			`node2.query("""`
			`ALTER TABLE t`
			`MODIFY SETTING replicated_fetches_http_connection_timeout = 30,`
			`replicated_fetches_http_receive_timeout = 1""")`

			`while True:`
			`timeout_exceptions = int(node2.query(`
			`"""`
			`SELECT count()`
			`FROM system.replication_queue`
			`WHERE last_exception LIKE '%e.displayText() = Timeout%'`
			`AND last_exception NOT LIKE '%connect timed out%'`
			`""").strip())`

			`if timeout_exceptions >= 2:`
			`break`

			`time.sleep(0.1)`

			`for instance in started_cluster.instances.values():`
			`# Workaround for DROP TABLE not finishing if it is started while table is readonly.`
			`instance.query("SYSTEM RESTART REPLICA t")`

			`# Cleanup data directory from test results archive.`
			`instance.query("DROP TABLE t SYNC")`