Split tests to make them stable

This commit is contained in:
alesapin 2021-02-12 11:50:20 +03:00
parent df181b534e
commit c883b7d154
14 changed files with 408 additions and 172 deletions

View File

@ -27,6 +27,30 @@ def started_cluster():
def smaller_exception(ex):
    """Return the text of ``ex`` truncated to its first two lines."""
    message_lines = str(ex).split('\n')
    return '\n'.join(message_lines[:2])
def wait_node(node):
    """Block until ``node`` answers a SQL query and accepts a keeper write.

    Makes up to 100 attempts, sleeping 0.2 seconds after every failed one.
    An attempt succeeds when the node can read ``system.zookeeper`` and a
    fresh client from ``get_fake_zk`` can create a sequential ``/test`` znode.

    Raises:
        Exception: if every attempt fails; the last failure is chained as
            the cause so the reason is visible in the traceback.
    """
    last_error = None
    for _ in range(100):
        zk = None
        try:
            node.query("SELECT * FROM system.zookeeper WHERE path = '/'")
            zk = get_fake_zk(node.name, timeout=30.0)
            zk.create("/test", sequence=True)
            print("node", node.name, "ready")
            break
        except Exception as ex:
            # Keep a reference: ``ex`` is unbound once the except clause ends.
            last_error = ex
            time.sleep(0.2)
            print("Waiting until", node.name, "will be ready, exception", ex)
        finally:
            # Release the probe connection on both success and failure.
            if zk:
                zk.stop()
                zk.close()
    else:
        # Only reached when the loop finished without ``break``.
        raise Exception("Can't wait node {} to become ready".format(node.name)) from last_error
def wait_nodes():
    """Wait, one node after another, until node1, node2 and node3 are ready."""
    for instance in (node1, node2, node3):
        wait_node(instance)
def get_fake_zk(nodename, timeout=30.0):
_fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout)
def reset_listener(state):
@ -39,182 +63,11 @@ def get_fake_zk(nodename, timeout=30.0):
_fake_zk_instance.start()
return _fake_zk_instance
def test_read_write_multinode(started_cluster):
    """Write one znode through each node and read all three back from every node."""
    # Pre-bind the connections so cleanup never touches an unbound name when
    # one of the connection attempts fails.
    node1_zk = node2_zk = node3_zk = None
    try:
        node1_zk = get_fake_zk("node1")
        node2_zk = get_fake_zk("node2")
        node3_zk = get_fake_zk("node3")

        node1_zk.create("/test_read_write_multinode_node1", b"somedata1")
        node2_zk.create("/test_read_write_multinode_node2", b"somedata2")
        node3_zk.create("/test_read_write_multinode_node3", b"somedata3")

        # stale reads are allowed
        # NOTE: these polls have no upper bound of their own.
        while node1_zk.exists("/test_read_write_multinode_node2") is None:
            time.sleep(0.1)

        while node1_zk.exists("/test_read_write_multinode_node3") is None:
            time.sleep(0.1)

        while node2_zk.exists("/test_read_write_multinode_node3") is None:
            time.sleep(0.1)

        expected = [
            ("/test_read_write_multinode_node1", b"somedata1"),
            ("/test_read_write_multinode_node2", b"somedata2"),
            ("/test_read_write_multinode_node3", b"somedata3"),
        ]
        for path, data in expected:
            for zk_conn in (node3_zk, node2_zk, node1_zk):
                assert zk_conn.get(path)[0] == data
    finally:
        # Close every opened connection independently: a failure on one must
        # not leave the others open.
        for zk_conn in [node1_zk, node2_zk, node3_zk]:
            if zk_conn is None:
                continue
            try:
                zk_conn.stop()
                zk_conn.close()
            except Exception as ex:
                print("Failed to close zk connection:", ex)
def test_watch_on_follower(started_cluster):
    """Set data watches through all three nodes and compare the delivered events."""
    # Pre-bind the connections so cleanup never touches an unbound name when
    # one of the connection attempts fails.
    node1_zk = node2_zk = node3_zk = None
    try:
        node1_zk = get_fake_zk("node1")
        node2_zk = get_fake_zk("node2")
        node3_zk = get_fake_zk("node3")

        node1_zk.create("/test_data_watches")
        node2_zk.set("/test_data_watches", b"hello")
        node3_zk.set("/test_data_watches", b"world")

        node1_data = None

        def node1_callback(event):
            print("node1 data watch called")
            nonlocal node1_data
            node1_data = event

        node1_zk.get("/test_data_watches", watch=node1_callback)

        node2_data = None

        def node2_callback(event):
            print("node2 data watch called")
            nonlocal node2_data
            node2_data = event

        node2_zk.get("/test_data_watches", watch=node2_callback)

        node3_data = None

        def node3_callback(event):
            print("node3 data watch called")
            nonlocal node3_data
            node3_data = event

        node3_zk.get("/test_data_watches", watch=node3_callback)

        # A single write should trigger the watch registered on every node.
        node1_zk.set("/test_data_watches", b"somevalue")
        time.sleep(3)

        print(node1_data)
        print(node2_data)
        print(node3_data)

        # NOTE(review): if no callback fires, all three values stay None and
        # these equality checks still pass - confirm that is acceptable.
        assert node1_data == node2_data
        assert node3_data == node2_data
    finally:
        # Close every opened connection independently: a failure on one must
        # not leave the others open.
        for zk_conn in [node1_zk, node2_zk, node3_zk]:
            if zk_conn is None:
                continue
            try:
                zk_conn.stop()
                zk_conn.close()
            except Exception as ex:
                print("Failed to close zk connection:", ex)
def test_session_expiration(started_cluster):
    """Check that node3's ephemeral znode disappears from node1 and node2.

    An ephemeral znode is created through node3 (5 second session timeout),
    node3 is partitioned from the other two nodes and its client is closed;
    node1 and node2 are then polled for up to 100 * 0.1 seconds.
    """
    # Pre-bind the connections so cleanup never touches an unbound name when
    # one of the connection attempts fails.
    node1_zk = node2_zk = node3_zk = None
    try:
        node1_zk = get_fake_zk("node1")
        node2_zk = get_fake_zk("node2")
        node3_zk = get_fake_zk("node3", timeout=5.0)

        node3_zk.create("/test_ephemeral_node", b"world", ephemeral=True)

        with PartitionManager() as pm:
            pm.partition_instances(node3, node2)
            pm.partition_instances(node3, node1)
            node3_zk.stop()
            node3_zk.close()
            for _ in range(100):
                if node1_zk.exists("/test_ephemeral_node") is None and node2_zk.exists("/test_ephemeral_node") is None:
                    break
                time.sleep(0.1)

            assert node1_zk.exists("/test_ephemeral_node") is None
            assert node2_zk.exists("/test_ephemeral_node") is None
    finally:
        # Best-effort cleanup; node3_zk may already have been closed above.
        for zk_conn in [node1_zk, node2_zk, node3_zk]:
            if zk_conn is None:
                continue
            try:
                zk_conn.stop()
                zk_conn.close()
            except Exception as ex:
                print("Failed to close zk connection:", ex)
def test_follower_restart(started_cluster):
    """Check that node3 serves a znode written via node1 after a kill-restart."""
    # Pre-bind the connections so cleanup never touches an unbound name when
    # the restart or a connection attempt fails.
    node1_zk = node3_zk = None
    try:
        node1_zk = get_fake_zk("node1")

        node1_zk.create("/test_restart_node", b"hello")

        node3.restart_clickhouse(kill=True)

        node3_zk = get_fake_zk("node3")

        # got data from log
        assert node3_zk.get("/test_restart_node")[0] == b"hello"
    finally:
        # Close every opened connection independently: a failure on one must
        # not leave the other open.
        for zk_conn in [node1_zk, node3_zk]:
            if zk_conn is None:
                continue
            try:
                zk_conn.stop()
                zk_conn.close()
            except Exception as ex:
                print("Failed to close zk connection:", ex)
def test_simple_replicated_table(started_cluster):
    """Create replicated table ``t`` on all nodes, insert via node2, check counts."""
    # something may be wrong after partition in other tests
    # so create with retry
    for i, node in enumerate([node1, node2, node3]):
        # The retry counter must not reuse ``i``: the replica name below is
        # derived from the node index, not from the attempt number.
        for _ in range(100):
            try:
                node.query("CREATE TABLE t (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t', '{}') ORDER BY tuple()".format(i + 1))
                break
            except Exception as ex:
                print("Failed to create table on", node.name, "exception", ex)
                time.sleep(0.1)

    node2.query("INSERT INTO t SELECT number FROM numbers(10)")

    node1.query("SYSTEM SYNC REPLICA t", timeout=10)
    node3.query("SYSTEM SYNC REPLICA t", timeout=10)

    assert node1.query("SELECT COUNT() FROM t") == "10\n"
    assert node2.query("SELECT COUNT() FROM t") == "10\n"
    assert node3.query("SELECT COUNT() FROM t") == "10\n"
# In extremely rare cases it can take more than 5 minutes in a debug build with a sanitizer
@pytest.mark.timeout(600)
def test_blocade_leader(started_cluster):
wait_nodes()
for i, node in enumerate([node1, node2, node3]):
node.query("CREATE TABLE t1 (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t1', '{}') ORDER BY tuple()".format(i + 1))
@ -337,6 +190,7 @@ def dump_zk(node, zk_path, replica_path):
# In extremely rare cases it can take more than 5 minutes in a debug build with a sanitizer
@pytest.mark.timeout(600)
def test_blocade_leader_twice(started_cluster):
wait_nodes()
for i, node in enumerate([node1, node2, node3]):
node.query("CREATE TABLE t2 (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t2', '{}') ORDER BY tuple()".format(i + 1))

View File

@ -0,0 +1 @@
#!/usr/bin/env python3

View File

@ -0,0 +1,38 @@
<yandex>
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>1</server_id>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>
<session_timeout_ms>10000</session_timeout_ms>
<raft_logs_level>trace</raft_logs_level>
</coordination_settings>
<raft_configuration>
<server>
<id>1</id>
<hostname>node1</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<priority>3</priority>
</server>
<server>
<id>2</id>
<hostname>node2</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<start_as_follower>true</start_as_follower>
<priority>2</priority>
</server>
<server>
<id>3</id>
<hostname>node3</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<start_as_follower>true</start_as_follower>
<priority>1</priority>
</server>
</raft_configuration>
</test_keeper_server>
</yandex>

View File

@ -0,0 +1,38 @@
<yandex>
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>2</server_id>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>
<session_timeout_ms>10000</session_timeout_ms>
<raft_logs_level>trace</raft_logs_level>
</coordination_settings>
<raft_configuration>
<server>
<id>1</id>
<hostname>node1</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<priority>3</priority>
</server>
<server>
<id>2</id>
<hostname>node2</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<start_as_follower>true</start_as_follower>
<priority>2</priority>
</server>
<server>
<id>3</id>
<hostname>node3</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<start_as_follower>true</start_as_follower>
<priority>1</priority>
</server>
</raft_configuration>
</test_keeper_server>
</yandex>

View File

@ -0,0 +1,38 @@
<yandex>
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>3</server_id>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>
<session_timeout_ms>10000</session_timeout_ms>
<raft_logs_level>trace</raft_logs_level>
</coordination_settings>
<raft_configuration>
<server>
<id>1</id>
<hostname>node1</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<priority>3</priority>
</server>
<server>
<id>2</id>
<hostname>node2</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<start_as_follower>true</start_as_follower>
<priority>2</priority>
</server>
<server>
<id>3</id>
<hostname>node3</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<start_as_follower>true</start_as_follower>
<priority>1</priority>
</server>
</raft_configuration>
</test_keeper_server>
</yandex>

View File

@ -0,0 +1,12 @@
<yandex>
<shutdown_wait_unfinished>3</shutdown_wait_unfinished>
<logger>
<level>trace</level>
<log>/var/log/clickhouse-server/log.log</log>
<errorlog>/var/log/clickhouse-server/log.err.log</errorlog>
<size>1000M</size>
<count>10</count>
<stderr>/var/log/clickhouse-server/stderr.log</stderr>
<stdout>/var/log/clickhouse-server/stdout.log</stdout>
</logger>
</yandex>

View File

@ -0,0 +1,16 @@
<yandex>
<zookeeper>
<node index="1">
<host>node1</host>
<port>9181</port>
</node>
<node index="2">
<host>node2</host>
<port>9181</port>
</node>
<node index="3">
<host>node3</host>
<port>9181</port>
</node>
</zookeeper>
</yandex>

View File

@ -0,0 +1,239 @@
import pytest
from helpers.cluster import ClickHouseCluster
import random
import string
import os
import time
from multiprocessing.dummy import Pool
from helpers.network import PartitionManager
# One cluster of three instances; each instance gets its own keeper config
# plus the shared logging and keeper-client configs.
cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance(
    'node1',
    main_configs=['configs/enable_test_keeper1.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'],
    stay_alive=True,
)
node2 = cluster.add_instance(
    'node2',
    main_configs=['configs/enable_test_keeper2.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'],
    stay_alive=True,
)
node3 = cluster.add_instance(
    'node3',
    main_configs=['configs/enable_test_keeper3.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'],
    stay_alive=True,
)
from kazoo.client import KazooClient, KazooState
@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture: start the cluster, yield it, then shut it down."""
    try:
        cluster.start()
        yield cluster
    finally:
        # Runs on normal teardown and also when start() itself raises.
        cluster.shutdown()
def smaller_exception(ex):
    """Return the text of ``ex`` truncated to its first two lines."""
    message_lines = str(ex).split('\n')
    return '\n'.join(message_lines[:2])
def wait_node(node):
    """Block until ``node`` answers a SQL query and accepts a keeper write.

    Makes up to 100 attempts, sleeping 0.2 seconds after every failed one.
    An attempt succeeds when the node can read ``system.zookeeper`` and a
    fresh client from ``get_fake_zk`` can create a sequential ``/test`` znode.

    Raises:
        Exception: if every attempt fails; the last failure is chained as
            the cause so the reason is visible in the traceback.
    """
    last_error = None
    for _ in range(100):
        zk = None
        try:
            node.query("SELECT * FROM system.zookeeper WHERE path = '/'")
            zk = get_fake_zk(node.name, timeout=30.0)
            zk.create("/test", sequence=True)
            print("node", node.name, "ready")
            break
        except Exception as ex:
            # Keep a reference: ``ex`` is unbound once the except clause ends.
            last_error = ex
            time.sleep(0.2)
            print("Waiting until", node.name, "will be ready, exception", ex)
        finally:
            # Release the probe connection on both success and failure.
            if zk:
                zk.stop()
                zk.close()
    else:
        # Only reached when the loop finished without ``break``.
        raise Exception("Can't wait node {} to become ready".format(node.name)) from last_error
def wait_nodes():
    """Wait, one node after another, until node1, node2 and node3 are ready."""
    for instance in (node1, node2, node3):
        wait_node(instance)
def get_fake_zk(nodename, timeout=30.0):
    """Return a started KazooClient connected to port 9181 of ``nodename``.

    A state listener is registered that logs every state change and calls
    the client's ``_reset()`` whenever the new state is not CONNECTED.
    """
    address = cluster.get_instance_ip(nodename) + ":9181"
    client = KazooClient(hosts=address, timeout=timeout)

    def on_state_change(state):
        print("Fake zk callback called for state", state)
        if state == KazooState.CONNECTED:
            return
        client._reset()

    client.add_listener(on_state_change)
    client.start()
    return client
def test_read_write_multinode(started_cluster):
    """Write one znode through each node and read all three back from every node."""
    # Pre-bind the connections so cleanup never touches an unbound name when
    # waiting for the nodes or one of the connection attempts fails.
    node1_zk = node2_zk = node3_zk = None
    try:
        wait_nodes()
        node1_zk = get_fake_zk("node1")
        node2_zk = get_fake_zk("node2")
        node3_zk = get_fake_zk("node3")

        node1_zk.create("/test_read_write_multinode_node1", b"somedata1")
        node2_zk.create("/test_read_write_multinode_node2", b"somedata2")
        node3_zk.create("/test_read_write_multinode_node3", b"somedata3")

        # stale reads are allowed
        # NOTE: these polls have no upper bound of their own.
        while node1_zk.exists("/test_read_write_multinode_node2") is None:
            time.sleep(0.1)

        while node1_zk.exists("/test_read_write_multinode_node3") is None:
            time.sleep(0.1)

        while node2_zk.exists("/test_read_write_multinode_node3") is None:
            time.sleep(0.1)

        expected = [
            ("/test_read_write_multinode_node1", b"somedata1"),
            ("/test_read_write_multinode_node2", b"somedata2"),
            ("/test_read_write_multinode_node3", b"somedata3"),
        ]
        for path, data in expected:
            for zk_conn in (node3_zk, node2_zk, node1_zk):
                assert zk_conn.get(path)[0] == data
    finally:
        # Close every opened connection independently: a failure on one must
        # not leave the others open.
        for zk_conn in [node1_zk, node2_zk, node3_zk]:
            if zk_conn is None:
                continue
            try:
                zk_conn.stop()
                zk_conn.close()
            except Exception as ex:
                print("Failed to close zk connection:", ex)
def test_watch_on_follower(started_cluster):
    """Set data watches through all three nodes and compare the delivered events."""
    # Pre-bind the connections so cleanup never touches an unbound name when
    # waiting for the nodes or one of the connection attempts fails.
    node1_zk = node2_zk = node3_zk = None
    try:
        wait_nodes()
        node1_zk = get_fake_zk("node1")
        node2_zk = get_fake_zk("node2")
        node3_zk = get_fake_zk("node3")

        node1_zk.create("/test_data_watches")
        node2_zk.set("/test_data_watches", b"hello")
        node3_zk.set("/test_data_watches", b"world")

        node1_data = None

        def node1_callback(event):
            print("node1 data watch called")
            nonlocal node1_data
            node1_data = event

        node1_zk.get("/test_data_watches", watch=node1_callback)

        node2_data = None

        def node2_callback(event):
            print("node2 data watch called")
            nonlocal node2_data
            node2_data = event

        node2_zk.get("/test_data_watches", watch=node2_callback)

        node3_data = None

        def node3_callback(event):
            print("node3 data watch called")
            nonlocal node3_data
            node3_data = event

        node3_zk.get("/test_data_watches", watch=node3_callback)

        # A single write should trigger the watch registered on every node.
        node1_zk.set("/test_data_watches", b"somevalue")
        time.sleep(3)

        print(node1_data)
        print(node2_data)
        print(node3_data)

        # NOTE(review): if no callback fires, all three values stay None and
        # these equality checks still pass - confirm that is acceptable.
        assert node1_data == node2_data
        assert node3_data == node2_data
    finally:
        # Close every opened connection independently: a failure on one must
        # not leave the others open.
        for zk_conn in [node1_zk, node2_zk, node3_zk]:
            if zk_conn is None:
                continue
            try:
                zk_conn.stop()
                zk_conn.close()
            except Exception as ex:
                print("Failed to close zk connection:", ex)
def test_session_expiration(started_cluster):
    """Check that node3's ephemeral znode disappears from node1 and node2.

    An ephemeral znode is created through node3 (3 second session timeout),
    node3 is partitioned from the other two nodes and its client is closed;
    node1 and node2 are then polled up to 100 times, 0.1 seconds apart, with
    a ``sync`` on each after every unsuccessful poll.
    """
    # Pre-bind the connections so cleanup never touches an unbound name when
    # waiting for the nodes or one of the connection attempts fails.
    node1_zk = node2_zk = node3_zk = None
    try:
        wait_nodes()
        node1_zk = get_fake_zk("node1")
        node2_zk = get_fake_zk("node2")
        node3_zk = get_fake_zk("node3", timeout=3.0)
        print("Node3 session id", node3_zk._session_id)

        node3_zk.create("/test_ephemeral_node", b"world", ephemeral=True)

        with PartitionManager() as pm:
            pm.partition_instances(node3, node2)
            pm.partition_instances(node3, node1)
            node3_zk.stop()
            node3_zk.close()
            for _ in range(100):
                if node1_zk.exists("/test_ephemeral_node") is None and node2_zk.exists("/test_ephemeral_node") is None:
                    break
                print("Node1 exists", node1_zk.exists("/test_ephemeral_node"))
                print("Node2 exists", node2_zk.exists("/test_ephemeral_node"))
                time.sleep(0.1)
                node1_zk.sync("/")
                node2_zk.sync("/")

            assert node1_zk.exists("/test_ephemeral_node") is None
            assert node2_zk.exists("/test_ephemeral_node") is None
    finally:
        # Best-effort cleanup; node3_zk may already have been closed above.
        for zk_conn in [node1_zk, node2_zk, node3_zk]:
            if zk_conn is None:
                continue
            try:
                zk_conn.stop()
                zk_conn.close()
            except Exception as ex:
                print("Failed to close zk connection:", ex)
def test_follower_restart(started_cluster):
    """Check that node3 serves a znode written via node1 after a kill-restart."""
    # Pre-bind the connections so cleanup never touches an unbound name when
    # waiting for the nodes, the restart or a connection attempt fails.
    node1_zk = node3_zk = None
    try:
        wait_nodes()
        node1_zk = get_fake_zk("node1")

        node1_zk.create("/test_restart_node", b"hello")

        node3.restart_clickhouse(kill=True)

        node3_zk = get_fake_zk("node3")

        # got data from log
        assert node3_zk.get("/test_restart_node")[0] == b"hello"
    finally:
        # Close every opened connection independently: a failure on one must
        # not leave the other open.
        for zk_conn in [node1_zk, node3_zk]:
            if zk_conn is None:
                continue
            try:
                zk_conn.stop()
                zk_conn.close()
            except Exception as ex:
                print("Failed to close zk connection:", ex)
def test_simple_replicated_table(started_cluster):
    """Create replicated table ``t`` on all nodes, insert via node2, check counts."""
    wait_nodes()

    # Replica names are "1", "2" and "3" for node1, node2 and node3.
    for replica_num, instance in enumerate([node1, node2, node3], start=1):
        instance.query("CREATE TABLE t (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t', '{}') ORDER BY tuple()".format(replica_num))

    node2.query("INSERT INTO t SELECT number FROM numbers(10)")

    # Sync the two replicas that did not receive the insert directly.
    for replica in (node1, node3):
        replica.query("SYSTEM SYNC REPLICA t", timeout=10)

    for instance in (node1, node2, node3):
        assert instance.query("SELECT COUNT() FROM t") == "10\n"