ClickHouse/tests/integration/test_s3_zero_copy_replication/test.py
2024-11-24 14:26:38 +00:00

762 lines
24 KiB
Python

import datetime
import logging
import threading
import time
import pytest
from helpers.cluster import ClickHouseCluster
from helpers.network import PartitionManager
logging.getLogger().setLevel(logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())
cluster = ClickHouseCluster(__file__)
@pytest.fixture(scope="module")
def started_cluster():
try:
cluster.add_instance(
"node1",
main_configs=["configs/config.d/s3.xml"],
macros={"replica": "1"},
with_minio=True,
with_zookeeper=True,
)
cluster.add_instance(
"node2",
main_configs=["configs/config.d/s3.xml"],
macros={"replica": "2"},
with_minio=True,
with_zookeeper=True,
)
logging.info("Starting cluster...")
cluster.start()
logging.info("Cluster started")
yield cluster
finally:
cluster.shutdown()
def get_large_objects_count(cluster, size=100, folder="data"):
minio = cluster.minio_client
counter = 0
for obj in minio.list_objects(
cluster.minio_bucket, "{}/".format(folder), recursive=True
):
if obj.size is not None and obj.size >= size:
counter = counter + 1
return counter
def check_objects_exist(cluster, object_list, folder="data"):
minio = cluster.minio_client
for obj in object_list:
if obj:
minio.stat_object(cluster.minio_bucket, "{}/{}".format(folder, obj))
def check_objects_not_exisis(cluster, object_list, folder="data"):
minio = cluster.minio_client
for obj in object_list:
if obj:
try:
minio.stat_object(cluster.minio_bucket, "{}/{}".format(folder, obj))
except Exception as error:
assert "NoSuchKey" in str(error)
else:
assert False, "Object {} should not be exists".format(obj)
def wait_for_large_objects_count(cluster, expected, size=100, timeout=30):
while timeout > 0:
if get_large_objects_count(cluster, size=size) == expected:
return
timeout -= 1
time.sleep(1)
assert get_large_objects_count(cluster, size=size) == expected
def wait_for_active_parts(
node, num_expected_parts, table_name, timeout=30, disk_name=None
):
deadline = time.monotonic() + timeout
num_parts = 0
while time.monotonic() < deadline:
query = (
f"select count() from system.parts where table = '{table_name}' and active"
)
if disk_name:
query += f" and disk_name='{disk_name}'"
num_parts_str = node.query(query)
num_parts = int(num_parts_str.strip())
if num_parts == num_expected_parts:
return
time.sleep(0.2)
assert num_parts == num_expected_parts
@pytest.fixture(scope="function")
def test_name(request):
return request.node.name
@pytest.fixture(scope="function")
def test_table(test_name):
normalized = (
test_name.replace("[", "_")
.replace("]", "_")
.replace(" ", "_")
.replace("-", "_")
)
return "table_" + normalized
# Result of `get_large_objects_count` can be changed in other tests, so run this case at the beginning
@pytest.mark.order(0)
@pytest.mark.parametrize("policy", ["s3"])
def test_s3_zero_copy_replication(started_cluster, policy):
node1 = cluster.instances["node1"]
node2 = cluster.instances["node2"]
node1.query(
"""
CREATE TABLE s3_test ON CLUSTER test_cluster (id UInt32, value String)
ENGINE=ReplicatedMergeTree('/clickhouse/tables/s3_test', '{}')
ORDER BY id
SETTINGS storage_policy='{}'
""".format(
"{replica}", policy
)
)
node1.query("INSERT INTO s3_test VALUES (0,'data'),(1,'data')")
node2.query("SYSTEM SYNC REPLICA s3_test", timeout=30)
assert (
node1.query("SELECT * FROM s3_test order by id FORMAT Values")
== "(0,'data'),(1,'data')"
)
assert (
node2.query("SELECT * FROM s3_test order by id FORMAT Values")
== "(0,'data'),(1,'data')"
)
# Based on version 21.x - should be only 1 file with size 100+ (checksums.txt), used by both nodes
assert get_large_objects_count(cluster) == 1
node2.query("INSERT INTO s3_test VALUES (2,'data'),(3,'data')")
node1.query("SYSTEM SYNC REPLICA s3_test", timeout=30)
assert (
node2.query("SELECT * FROM s3_test order by id FORMAT Values")
== "(0,'data'),(1,'data'),(2,'data'),(3,'data')"
)
assert (
node1.query("SELECT * FROM s3_test order by id FORMAT Values")
== "(0,'data'),(1,'data'),(2,'data'),(3,'data')"
)
# Based on version 21.x - two parts
wait_for_large_objects_count(cluster, 2)
node1.query("OPTIMIZE TABLE s3_test FINAL")
# Based on version 21.x - after merge, two old parts and one merged
wait_for_large_objects_count(cluster, 3)
# Based on version 21.x - after cleanup - only one merged part
wait_for_large_objects_count(cluster, 1, timeout=60)
node1.query("DROP TABLE IF EXISTS s3_test SYNC")
node2.query("DROP TABLE IF EXISTS s3_test SYNC")
def insert_data_time(node, table, number_of_mb, time, start=0):
values = ",".join(
f"({x},{time})"
for x in range(start, int((1024 * 1024 * number_of_mb) / 8) + start + 1)
)
node.query(f"INSERT INTO {table} VALUES {values}")
def insert_large_data(node, table):
tm = time.mktime((datetime.date.today() - datetime.timedelta(days=7)).timetuple())
insert_data_time(node, table, 1, tm, 0)
tm = time.mktime((datetime.date.today() - datetime.timedelta(days=3)).timetuple())
insert_data_time(node, table, 1, tm, 1024 * 1024)
tm = time.mktime(datetime.date.today().timetuple())
insert_data_time(node, table, 10, tm, 1024 * 1024 * 2)
@pytest.mark.parametrize(
("storage_policy", "large_data", "iterations"),
[
("tiered", False, 10),
("tiered_copy", False, 10),
("tiered", True, 3),
("tiered_copy", True, 3),
],
)
def test_s3_zero_copy_with_ttl_move(
started_cluster, storage_policy, large_data, iterations
):
node1 = cluster.instances["node1"]
node2 = cluster.instances["node2"]
node1.query("DROP TABLE IF EXISTS ttl_move_test SYNC")
node2.query("DROP TABLE IF EXISTS ttl_move_test SYNC")
for i in range(iterations):
node1.query(
"""
CREATE TABLE ttl_move_test ON CLUSTER test_cluster (d UInt64, d1 DateTime)
ENGINE=ReplicatedMergeTree('/clickhouse/tables/ttl_move_test', '{}')
ORDER BY d
TTL d1 + INTERVAL 2 DAY TO VOLUME 'external'
SETTINGS storage_policy='{}'
""".format(
"{replica}", storage_policy
)
)
if large_data:
insert_large_data(node1, "ttl_move_test")
else:
node1.query("INSERT INTO ttl_move_test VALUES (10, now() - INTERVAL 3 DAY)")
node1.query("INSERT INTO ttl_move_test VALUES (11, now() - INTERVAL 1 DAY)")
node1.query("OPTIMIZE TABLE ttl_move_test FINAL")
node2.query("SYSTEM SYNC REPLICA ttl_move_test", timeout=30)
if large_data:
assert (
node1.query("SELECT count() FROM ttl_move_test FORMAT Values")
== "(1572867)"
)
assert (
node2.query("SELECT count() FROM ttl_move_test FORMAT Values")
== "(1572867)"
)
else:
assert (
node1.query("SELECT count() FROM ttl_move_test FORMAT Values") == "(2)"
)
assert (
node2.query("SELECT count() FROM ttl_move_test FORMAT Values") == "(2)"
)
assert (
node1.query("SELECT d FROM ttl_move_test ORDER BY d FORMAT Values")
== "(10),(11)"
)
assert (
node2.query("SELECT d FROM ttl_move_test ORDER BY d FORMAT Values")
== "(10),(11)"
)
node1.query("DROP TABLE IF EXISTS ttl_move_test SYNC")
node2.query("DROP TABLE IF EXISTS ttl_move_test SYNC")
@pytest.mark.parametrize(
("large_data", "iterations"),
[
(False, 10),
(True, 3),
],
)
def test_s3_zero_copy_with_ttl_delete(started_cluster, large_data, iterations):
node1 = cluster.instances["node1"]
node2 = cluster.instances["node2"]
node1.query("DROP TABLE IF EXISTS ttl_delete_test SYNC")
node2.query("DROP TABLE IF EXISTS ttl_delete_test SYNC")
for i in range(iterations):
node1.query(
"""
CREATE TABLE ttl_delete_test ON CLUSTER test_cluster (d UInt64, d1 DateTime)
ENGINE=ReplicatedMergeTree('/clickhouse/tables/ttl_delete_test', '{}')
ORDER BY d
TTL d1 + INTERVAL 2 DAY
SETTINGS storage_policy='tiered'
""".format(
"{replica}"
)
)
if large_data:
insert_large_data(node1, "ttl_delete_test")
else:
node1.query(
"INSERT INTO ttl_delete_test VALUES (10, now() - INTERVAL 3 DAY)"
)
node1.query(
"INSERT INTO ttl_delete_test VALUES (11, now() - INTERVAL 1 DAY)"
)
node1.query("OPTIMIZE TABLE ttl_delete_test FINAL")
node1.query("SYSTEM SYNC REPLICA ttl_delete_test", timeout=30)
node2.query("SYSTEM SYNC REPLICA ttl_delete_test", timeout=30)
if large_data:
assert (
node1.query("SELECT count() FROM ttl_delete_test FORMAT Values")
== "(1310721)"
)
assert (
node2.query("SELECT count() FROM ttl_delete_test FORMAT Values")
== "(1310721)"
)
else:
assert (
node1.query("SELECT count() FROM ttl_delete_test FORMAT Values")
== "(1)"
)
assert (
node2.query("SELECT count() FROM ttl_delete_test FORMAT Values")
== "(1)"
)
assert (
node1.query("SELECT d FROM ttl_delete_test ORDER BY d FORMAT Values")
== "(11)"
)
assert (
node2.query("SELECT d FROM ttl_delete_test ORDER BY d FORMAT Values")
== "(11)"
)
node1.query("DROP TABLE IF EXISTS ttl_delete_test SYNC")
node2.query("DROP TABLE IF EXISTS ttl_delete_test SYNC")
def wait_mutations(node, table, seconds):
time.sleep(1)
while seconds > 0:
seconds -= 1
mutations = node.query(
f"SELECT count() FROM system.mutations WHERE table='{table}' AND is_done=0"
)
if mutations == "0\n":
return
time.sleep(1)
mutations = node.query(
f"SELECT count() FROM system.mutations WHERE table='{table}' AND is_done=0"
)
assert mutations == "0\n"
def wait_for_clean_old_parts(node, table, seconds):
time.sleep(1)
while seconds > 0:
seconds -= 1
parts = node.query(
f"SELECT count() FROM system.parts WHERE table='{table}' AND active=0"
)
if parts == "0\n":
return
time.sleep(1)
parts = node.query(
f"SELECT count() FROM system.parts WHERE table='{table}' AND active=0"
)
assert parts == "0\n"
def s3_zero_copy_unfreeze_base(cluster, unfreeze_query_template):
node1 = cluster.instances["node1"]
node2 = cluster.instances["node2"]
node1.query("DROP TABLE IF EXISTS unfreeze_test SYNC")
node2.query("DROP TABLE IF EXISTS unfreeze_test SYNC")
node1.query(
"""
CREATE TABLE unfreeze_test ON CLUSTER test_cluster (d UInt64)
ENGINE=ReplicatedMergeTree('/clickhouse/tables/unfreeze_test', '{}')
ORDER BY d
SETTINGS storage_policy='s3'
""".format(
"{replica}"
)
)
node1.query("INSERT INTO unfreeze_test VALUES (0)")
wait_for_active_parts(node2, 1, "unfreeze_test")
node1.query("ALTER TABLE unfreeze_test FREEZE WITH NAME 'freeze_backup1'")
node2.query("ALTER TABLE unfreeze_test FREEZE WITH NAME 'freeze_backup2'")
wait_mutations(node1, "unfreeze_test", 10)
wait_mutations(node2, "unfreeze_test", 10)
objects01 = node1.get_backuped_s3_objects("s31", "freeze_backup1")
objects02 = node2.get_backuped_s3_objects("s31", "freeze_backup2")
assert objects01 == objects02
check_objects_exist(cluster, objects01)
node1.query("TRUNCATE TABLE unfreeze_test")
node2.query("SYSTEM SYNC REPLICA unfreeze_test", timeout=30)
objects11 = node1.get_backuped_s3_objects("s31", "freeze_backup1")
objects12 = node2.get_backuped_s3_objects("s31", "freeze_backup2")
assert objects01 == objects11
assert objects01 == objects12
check_objects_exist(cluster, objects11)
node1.query(f"{unfreeze_query_template} 'freeze_backup1'")
wait_mutations(node1, "unfreeze_test", 10)
check_objects_exist(cluster, objects12)
node2.query(f"{unfreeze_query_template} 'freeze_backup2'")
wait_mutations(node2, "unfreeze_test", 10)
check_objects_not_exisis(cluster, objects12)
node1.query("DROP TABLE IF EXISTS unfreeze_test SYNC")
node2.query("DROP TABLE IF EXISTS unfreeze_test SYNC")
def test_s3_zero_copy_unfreeze_alter(started_cluster):
s3_zero_copy_unfreeze_base(cluster, "ALTER TABLE unfreeze_test UNFREEZE WITH NAME")
def test_s3_zero_copy_unfreeze_system(started_cluster):
s3_zero_copy_unfreeze_base(cluster, "SYSTEM UNFREEZE WITH NAME")
def s3_zero_copy_drop_detached(cluster, unfreeze_query_template):
node1 = cluster.instances["node1"]
node2 = cluster.instances["node2"]
node1.query("DROP TABLE IF EXISTS drop_detached_test SYNC")
node2.query("DROP TABLE IF EXISTS drop_detached_test SYNC")
node1.query(
"""
CREATE TABLE drop_detached_test ON CLUSTER test_cluster (d UInt64)
ENGINE=ReplicatedMergeTree('/clickhouse/tables/drop_detached_test', '{}')
ORDER BY d PARTITION BY d
SETTINGS storage_policy='s3'
""".format(
"{replica}"
)
)
node1.query("INSERT INTO drop_detached_test VALUES (0)")
node1.query("ALTER TABLE drop_detached_test FREEZE WITH NAME 'detach_backup1'")
node1.query("INSERT INTO drop_detached_test VALUES (1)")
node1.query("ALTER TABLE drop_detached_test FREEZE WITH NAME 'detach_backup2'")
node2.query("SYSTEM SYNC REPLICA drop_detached_test", timeout=30)
objects1 = node1.get_backuped_s3_objects("s31", "detach_backup1")
objects2 = node1.get_backuped_s3_objects("s31", "detach_backup2")
objects_diff = list(set(objects2) - set(objects1))
node1.query(f"{unfreeze_query_template} 'detach_backup2'")
node1.query(f"{unfreeze_query_template} 'detach_backup1'")
node1.query("ALTER TABLE drop_detached_test DETACH PARTITION '0'")
node1.query("ALTER TABLE drop_detached_test DETACH PARTITION '1'")
node2.query("SYSTEM SYNC REPLICA drop_detached_test", timeout=30)
wait_mutations(node1, "drop_detached_test", 10)
wait_mutations(node2, "drop_detached_test", 10)
check_objects_exist(cluster, objects1)
check_objects_exist(cluster, objects2)
node2.query(
"ALTER TABLE drop_detached_test DROP DETACHED PARTITION '1'",
settings={"allow_drop_detached": 1},
)
node1.query("SYSTEM SYNC REPLICA drop_detached_test", timeout=30)
wait_mutations(node1, "drop_detached_test", 10)
wait_mutations(node2, "drop_detached_test", 10)
check_objects_exist(cluster, objects1)
check_objects_exist(cluster, objects2)
node1.query(
"ALTER TABLE drop_detached_test DROP DETACHED PARTITION '1'",
settings={"allow_drop_detached": 1},
)
node2.query("SYSTEM SYNC REPLICA drop_detached_test", timeout=30)
wait_mutations(node1, "drop_detached_test", 10)
wait_mutations(node2, "drop_detached_test", 10)
check_objects_exist(cluster, objects1)
check_objects_not_exisis(cluster, objects_diff)
node1.query(
"ALTER TABLE drop_detached_test DROP DETACHED PARTITION '0'",
settings={"allow_drop_detached": 1},
)
node2.query("SYSTEM SYNC REPLICA drop_detached_test", timeout=30)
wait_mutations(node1, "drop_detached_test", 10)
wait_mutations(node2, "drop_detached_test", 10)
check_objects_exist(cluster, objects1)
node2.query(
"ALTER TABLE drop_detached_test DROP DETACHED PARTITION '0'",
settings={"allow_drop_detached": 1},
)
node1.query("SYSTEM SYNC REPLICA drop_detached_test", timeout=30)
wait_mutations(node1, "drop_detached_test", 10)
wait_mutations(node2, "drop_detached_test", 10)
check_objects_not_exisis(cluster, objects1)
def test_s3_zero_copy_drop_detached_alter(started_cluster):
s3_zero_copy_drop_detached(
cluster, "ALTER TABLE drop_detached_test UNFREEZE WITH NAME"
)
def test_s3_zero_copy_drop_detached_system(started_cluster):
s3_zero_copy_drop_detached(cluster, "SYSTEM UNFREEZE WITH NAME")
def test_s3_zero_copy_concurrent_merge(started_cluster):
node1 = cluster.instances["node1"]
node2 = cluster.instances["node2"]
node1.query("DROP TABLE IF EXISTS concurrent_merge SYNC")
node2.query("DROP TABLE IF EXISTS concurrent_merge SYNC")
for node in (node1, node2):
node.query(
"""
CREATE TABLE concurrent_merge (id UInt64)
ENGINE=ReplicatedMergeTree('/clickhouse/tables/concurrent_merge', '{replica}')
ORDER BY id
SETTINGS index_granularity=2, storage_policy='s3', remote_fs_execute_merges_on_single_replica_time_threshold=1
"""
)
node1.query("system stop merges")
node2.query("system stop merges")
# This will generate two parts with 20 granules each
node1.query("insert into concurrent_merge select number from numbers(40)")
node1.query("insert into concurrent_merge select number + 1 from numbers(40)")
wait_for_active_parts(node2, 2, "concurrent_merge")
# Merge will materialize default column, it should sleep every granule and take 20 * 2 * 0.1 = 4 sec.
node1.query("alter table concurrent_merge add column x UInt32 default sleep(0.1)")
node1.query("system start merges")
node2.query("system start merges")
# Now, the merge should start.
# Because of remote_fs_execute_merges_on_single_replica_time_threshold=1,
# only one replica will start merge instantly.
# The other replica should wait for 1 sec and also start it.
# That should probably cause a data race at s3 storage.
# For now, it does not happen (every blob has a random name, and we just have a duplicating data)
node1.query("optimize table concurrent_merge final")
wait_for_active_parts(node1, 1, "concurrent_merge")
wait_for_active_parts(node2, 1, "concurrent_merge")
for node in (node1, node2):
assert node.query("select sum(id) from concurrent_merge").strip() == "1600"
def test_s3_zero_copy_keeps_data_after_mutation(started_cluster):
node1 = cluster.instances["node1"]
node2 = cluster.instances["node2"]
node1.query("DROP TABLE IF EXISTS zero_copy_mutation SYNC")
node2.query("DROP TABLE IF EXISTS zero_copy_mutation SYNC")
node1.query(
"""
CREATE TABLE zero_copy_mutation (id UInt64, value1 String, value2 String, value3 String)
ENGINE=ReplicatedMergeTree('/clickhouse/tables/zero_copy_mutation', '{replica}')
ORDER BY id
PARTITION BY (id % 4)
SETTINGS storage_policy='s3',
old_parts_lifetime=1000
"""
)
node2.query(
"""
CREATE TABLE zero_copy_mutation (id UInt64, value1 String, value2 String, value3 String)
ENGINE=ReplicatedMergeTree('/clickhouse/tables/zero_copy_mutation', '{replica}')
ORDER BY id
PARTITION BY (id % 4)
SETTINGS storage_policy='s3',
old_parts_lifetime=1000
"""
)
node1.query(
"""
INSERT INTO zero_copy_mutation
SELECT * FROM generateRandom('id UInt64, value1 String, value2 String, value3 String') limit 1000000
"""
)
wait_for_active_parts(node2, 4, "zero_copy_mutation")
objects1 = node1.get_table_objects("zero_copy_mutation")
check_objects_exist(cluster, objects1)
node1.query(
"""
ALTER TABLE zero_copy_mutation
ADD COLUMN valueX String MATERIALIZED value1
"""
)
node1.query(
"""
ALTER TABLE zero_copy_mutation
MATERIALIZE COLUMN valueX
"""
)
wait_mutations(node1, "zero_copy_mutation", 10)
wait_mutations(node2, "zero_copy_mutation", 10)
# If bug present at least one node has metadata with incorrect ref_count values.
# But it may be any node depends on mutation execution order.
# We can try to find one, but this required knowledge about internal metadata structure.
# It can be change in future, so we do not find this node here.
# And with the bug test may be success sometimes.
nodeX = node1
nodeY = node2
objectsY = nodeY.get_table_objects("zero_copy_mutation")
check_objects_exist(cluster, objectsY)
nodeX.query(
"""
ALTER TABLE zero_copy_mutation
DETACH PARTITION '0'
"""
)
nodeX.query(
"""
ALTER TABLE zero_copy_mutation
ATTACH PARTITION '0'
"""
)
wait_mutations(node1, "zero_copy_mutation", 10)
wait_mutations(node2, "zero_copy_mutation", 10)
nodeX.query(
"""
DROP TABLE zero_copy_mutation SYNC
"""
)
# time to remove objects
time.sleep(10)
nodeY.query(
"""
SELECT count() FROM zero_copy_mutation
WHERE value3 LIKE '%ab%'
"""
)
check_objects_exist(cluster, objectsY)
nodeY.query(
"""
DROP TABLE zero_copy_mutation SYNC
"""
)
# time to remove objects
time.sleep(10)
check_objects_not_exisis(cluster, objectsY)
@pytest.mark.parametrize(
"failpoint_lock",
["zero_copy_lock_zk_fail_before_op", "zero_copy_lock_zk_fail_after_op"],
)
@pytest.mark.parametrize(
"failpoint_unlock",
[None, "zero_copy_unlock_zk_fail_before_op", "zero_copy_unlock_zk_fail_after_op"],
)
def test_move_shared_zero_copy_lock_fail(
started_cluster, test_table, failpoint_lock, failpoint_unlock
):
node1 = cluster.instances["node1"]
node2 = cluster.instances["node2"]
node1.query(
f"""
CREATE TABLE {test_table} ON CLUSTER test_cluster (num UInt64, date DateTime)
ENGINE=ReplicatedMergeTree('/clickhouse/tables/{test_table}', '{{replica}}')
ORDER BY date PARTITION BY date
SETTINGS storage_policy='hybrid'
"""
)
date = "2024-10-23"
node2.query(f"SYSTEM STOP FETCHES {test_table}")
node1.query(f"INSERT INTO {test_table} VALUES (1, '{date}')")
# Try to move and get fail on acquring zero-copy shared lock
node1.query(f"SYSTEM ENABLE FAILPOINT {failpoint_lock}")
if failpoint_unlock:
node1.query(f"SYSTEM ENABLE FAILPOINT {failpoint_unlock}")
node1.query_and_get_error(
f"ALTER TABLE {test_table} MOVE PARTITION '{date}' TO VOLUME 'external'"
)
# After fail the part must remain on the source disk
assert (
node1.query(
f"SELECT disk_name FROM system.parts WHERE table='{test_table}' GROUP BY disk_name"
)
== "default\n"
)
# Try another attempt after zk connection is restored
# It should not failed due to leftovers of previous attempt (temporary cloned files)
node1.query(f"SYSTEM DISABLE FAILPOINT {failpoint_lock}")
if failpoint_unlock:
node1.query(f"SYSTEM DISABLE FAILPOINT {failpoint_unlock}")
node1.query(
f"ALTER TABLE {test_table} MOVE PARTITION '{date}' TO VOLUME 'external'"
)
assert (
node1.query(
f"SELECT disk_name FROM system.parts WHERE table='{test_table}' GROUP BY disk_name"
)
== "s31\n"
)
# Sanity check
node2.query(f"SYSTEM START FETCHES {test_table}")
wait_for_active_parts(node2, 1, test_table, disk_name="s31")
assert node2.query(f"SELECT sum(num) FROM {test_table}") == "1\n"
node1.query(f"DROP TABLE IF EXISTS {test_table} SYNC")
node2.query(f"DROP TABLE IF EXISTS {test_table} SYNC")