ClickHouse/tests/integration/test_dns_cache/test.py

import pytest
from helpers.client import QueryRuntimeException
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV
from helpers.test_tools import assert_eq_with_retry

cluster = ClickHouseCluster(__file__)


def _fill_nodes(nodes, table_name):
    for node in nodes:
        node.query(
            """
            CREATE DATABASE IF NOT EXISTS test;
            CREATE TABLE IF NOT EXISTS {0}(date Date, id UInt32)
            ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/{0}', '{1}')
            ORDER BY id PARTITION BY toYYYYMM(date);
            """.format(
                table_name, node.name
            )
        )


node1 = cluster.add_instance(
    "node1",
    main_configs=["configs/listen_host.xml"],
    with_zookeeper=True,
    ipv6_address="2001:3984:3989::1:1111",
)
node2 = cluster.add_instance(
    "node2",
    main_configs=["configs/listen_host.xml", "configs/dns_update_long.xml"],
    with_zookeeper=True,
    ipv6_address="2001:3984:3989::1:1112",
)


@pytest.fixture(scope="module")
def cluster_without_dns_cache_update():
    try:
        cluster.start()

        _fill_nodes([node1, node2], "test_table_drop")

        yield cluster

    except Exception as ex:
        print(ex)

    finally:
        cluster.shutdown()
        pass


# node1 is a source, node2 downloads data
# node2 has long dns_cache_update_period, so dns cache update wouldn't work
def test_ip_change_drop_dns_cache(cluster_without_dns_cache_update):
    # First we check, that normal replication works
    node1.query(
        "INSERT INTO test_table_drop VALUES ('2018-10-01', 1), ('2018-10-02', 2), ('2018-10-03', 3)"
    )
    assert node1.query("SELECT count(*) from test_table_drop") == "3\n"
    assert_eq_with_retry(node2, "SELECT count(*) from test_table_drop", "3")

    # We change source node ip
    cluster.restart_instance_with_ip_change(node1, "2001:3984:3989::1:7777")

    # Put some data to source node1
    node1.query(
        "INSERT INTO test_table_drop VALUES ('2018-10-01', 5), ('2018-10-02', 6), ('2018-10-03', 7)"
    )
    # Check that data is placed on node1
    assert node1.query("SELECT count(*) from test_table_drop") == "6\n"

    # Because of DNS cache dest node2 cannot download data from node1
    with pytest.raises(Exception):
        assert_eq_with_retry(node2, "SELECT count(*) from test_table_drop", "6")

    # drop DNS cache
    node2.query("SYSTEM DROP DNS CACHE")
    # Data is downloaded
    assert_eq_with_retry(node2, "SELECT count(*) from test_table_drop", "6")

    # Just to be sure check one more time
    node1.query("INSERT INTO test_table_drop VALUES ('2018-10-01', 8)")
    assert node1.query("SELECT count(*) from test_table_drop") == "7\n"
    assert_eq_with_retry(node2, "SELECT count(*) from test_table_drop", "7")


node3 = cluster.add_instance(
    "node3",
    main_configs=["configs/listen_host.xml"],
    with_zookeeper=True,
    ipv6_address="2001:3984:3989::1:1113",
)
node4 = cluster.add_instance(
    "node4",
    main_configs=[
        "configs/remote_servers.xml",
        "configs/listen_host.xml",
        "configs/dns_update_short.xml",
    ],
    with_zookeeper=True,
    ipv6_address="2001:3984:3989::1:1114",
)


@pytest.fixture(scope="module")
def cluster_with_dns_cache_update():
    try:
        cluster.start()

        _fill_nodes([node3, node4], "test_table_update")

        yield cluster

    except Exception as ex:
        print(ex)

    finally:
        cluster.shutdown()
        pass


# node3 is a source, node4 downloads data
# node4 has short dns_cache_update_period, so testing update of dns cache
def test_ip_change_update_dns_cache(cluster_with_dns_cache_update):
    # First we check, that normal replication works
    node3.query(
        "INSERT INTO test_table_update VALUES ('2018-10-01', 1), ('2018-10-02', 2), ('2018-10-03', 3)"
    )
    assert node3.query("SELECT count(*) from test_table_update") == "3\n"
    assert_eq_with_retry(node4, "SELECT count(*) from test_table_update", "3")

    # We change source node ip
    cluster.restart_instance_with_ip_change(node3, "2001:3984:3989::1:8888")

    # Put some data to source node3
    node3.query(
        "INSERT INTO test_table_update VALUES ('2018-10-01', 5), ('2018-10-02', 6), ('2018-10-03', 7)"
    )

    # Check that data is placed on node3
    assert node3.query("SELECT count(*) from test_table_update") == "6\n"

    curl_result = node4.exec_in_container(["bash", "-c", "curl -s 'node3:8123'"])
    assert curl_result == "Ok.\n"
    cat_resolv = node4.exec_in_container(["bash", "-c", "cat /etc/resolv.conf"])
    print(("RESOLV {}".format(cat_resolv)))

    assert_eq_with_retry(
        node4, "SELECT * FROM remote('node3', 'system', 'one')", "0", sleep_time=0.5
    )

    # Because of DNS cache update, ip of node3 would be updated
    assert_eq_with_retry(
        node4, "SELECT count(*) from test_table_update", "6", sleep_time=3
    )

    # Just to be sure check one more time
    node3.query("INSERT INTO test_table_update VALUES ('2018-10-01', 8)")
    assert node3.query("SELECT count(*) from test_table_update") == "7\n"
    assert_eq_with_retry(node4, "SELECT count(*) from test_table_update", "7")


def set_hosts(node, hosts):
    new_content = "\\n".join(["127.0.0.1 localhost", "::1 localhost"] + hosts)
    node.exec_in_container(
        ["bash", "-c", 'echo -e "{}" > /etc/hosts'.format(new_content)],
        privileged=True,
        user="root",
    )


def test_dns_cache_update(cluster_with_dns_cache_update):
    set_hosts(node4, ["127.255.255.255 lost_host"])

    with pytest.raises(QueryRuntimeException):
        node4.query("SELECT * FROM remote('lost_host', 'system', 'one')")

    node4.query(
        "CREATE TABLE distributed_lost_host (dummy UInt8) ENGINE = Distributed(lost_host_cluster, 'system', 'one')"
    )
    with pytest.raises(QueryRuntimeException):
        node4.query("SELECT * FROM distributed_lost_host")

    set_hosts(node4, ["127.0.0.1 lost_host"])

    # Wait a bit until dns cache will be updated
    assert_eq_with_retry(
        node4, "SELECT * FROM remote('lost_host', 'system', 'one')", "0"
    )
    assert_eq_with_retry(node4, "SELECT * FROM distributed_lost_host", "0")

    assert TSV(
        node4.query(
            "SELECT DISTINCT host_name, host_address FROM system.clusters WHERE cluster='lost_host_cluster'"
        )
    ) == TSV("lost_host\t127.0.0.1\n")
    assert TSV(node4.query("SELECT hostName()")) == TSV("node4")


# Check SYSTEM DROP DNS CACHE on node5 and background cache update on node6
node5 = cluster.add_instance(
    "node5",
    main_configs=["configs/listen_host.xml", "configs/dns_update_long.xml"],
    user_configs=["configs/users_with_hostname.xml"],
    ipv6_address="2001:3984:3989::1:1115",
)
node6 = cluster.add_instance(
    "node6",
    main_configs=["configs/listen_host.xml", "configs/dns_update_short.xml"],
    user_configs=["configs/users_with_hostname.xml"],
    ipv6_address="2001:3984:3989::1:1116",
)


@pytest.mark.parametrize("node", [node5, node6])
def test_user_access_ip_change(cluster_with_dns_cache_update, node):
    node_name = node.name
    node_num = node.name[-1]
    # getaddrinfo(...) may hang for a log time without this options
    node.exec_in_container(
        [
            "bash",
            "-c",
            'echo -e "options timeout:1\noptions attempts:2" >> /etc/resolv.conf',
        ],
        privileged=True,
        user="root",
    )

    assert (
        node3.query("SELECT * FROM remote('{}', 'system', 'one')".format(node_name))
        == "0\n"
    )
    assert (
        node4.query("SELECT * FROM remote('{}', 'system', 'one')".format(node_name))
        == "0\n"
    )

    set_hosts(
        node,
        [
            "127.255.255.255 node3",
            "2001:3984:3989::1:88{}4 unknown_host".format(node_num),
        ],
    )

    cluster.restart_instance_with_ip_change(
        node3, "2001:3984:3989::1:88{}3".format(node_num)
    )
    cluster.restart_instance_with_ip_change(
        node4, "2001:3984:3989::1:88{}4".format(node_num)
    )

    with pytest.raises(QueryRuntimeException):
        node3.query("SELECT * FROM remote('{}', 'system', 'one')".format(node_name))
    with pytest.raises(QueryRuntimeException):
        node4.query("SELECT * FROM remote('{}', 'system', 'one')".format(node_name))
    # now wrong addresses are cached

    set_hosts(node, [])
    retry_count = 60
    if node_name == "node5":
        # client is not allowed to connect, so execute it directly in container to send query from localhost
        node.exec_in_container(
            ["bash", "-c", 'clickhouse client -q "SYSTEM DROP DNS CACHE"'],
            privileged=True,
            user="root",
        )
        retry_count = 1

    assert_eq_with_retry(
        node3,
        "SELECT * FROM remote('{}', 'system', 'one')".format(node_name),
        "0",
        retry_count=retry_count,
        sleep_time=1,
    )
    assert_eq_with_retry(
        node4,
        "SELECT * FROM remote('{}', 'system', 'one')".format(node_name),
        "0",
        retry_count=retry_count,
        sleep_time=1,
    )


def test_host_is_drop_from_cache_after_consecutive_failures(
    cluster_with_dns_cache_update,
):
    with pytest.raises(QueryRuntimeException):
        node4.query(
            "SELECT * FROM remote('InvalidHostThatDoesNotExist', 'system', 'one')"
        )

    # Note that the list of hosts in variable since lost_host will be there too (and it's dropped and added back)
    # dns_update_short -> dns_max_consecutive_failures set to 6
    assert node4.wait_for_log_line(
        "Cannot resolve host \\(InvalidHostThatDoesNotExist\\), error 0: Host not found."
    )
    assert node4.wait_for_log_line(
        "Cached hosts not found:.*InvalidHostThatDoesNotExist**",
        repetitions=6,
        timeout=60,
        look_behind_lines=500,
    )
    assert node4.wait_for_log_line(
        "Cached hosts dropped:.*InvalidHostThatDoesNotExist.*"
    )