update metadata on replica recovery

Alexander Tokmakov 2021-04-28 20:49:27 +03:00
parent a4a2a61eed
commit bbf3bbc76d
3 changed files with 172 additions and 14 deletions

View File

@@ -121,6 +121,7 @@ struct ReplicatedMergeTreeLogEntryData
int alter_version = -1; /// May be equal to -1, if it's a normal mutation, not a metadata update.
/// only ALTER METADATA command
/// TODO Seems like it's never used
bool have_mutation = false; /// If this alter requires an additional mutation step, for data update
String columns_str; /// New columns data corresponding to alter_version

View File

@@ -2530,7 +2530,7 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo
/// Let's remember the queue of the reference/master replica.
source_queue_names = zookeeper->getChildren(source_path + "/queue");
/// Check that the log pointer of the source replica didn't change while we read queue entries
ops.push_back(zkutil::makeCheckRequest(source_path + "/log_pointer", log_pointer_stat.version));
auto rc = zookeeper->tryMulti(ops, responses);
@@ -2576,6 +2576,65 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo
source_queue.push_back(entry);
}
/// We should do this after copying the queue, because otherwise some ALTER_METADATA entries can be lost.
Int32 source_metadata_version = parse<Int32>(zookeeper->get(source_path + "/metadata_version"));
if (metadata_version != source_metadata_version)
{
/// Our metadata is not up to date with the source replica's metadata.
/// Metadata is updated by ALTER_METADATA entries, but some entries have probably been cleaned up from the log.
/// It's also possible that some newer ALTER_METADATA entries are present in the source_queue list,
/// and the source replica is executing such an entry right now (or has executed it recently).
/// Moreover, the /metadata_version update is not atomic with the /columns and /metadata updates...
/// Fortunately, ALTER_METADATA seems to be idempotent,
/// and older entries of that type can be replaced with newer ones.
/// Let's try to get consistent values of the source replica's /columns and /metadata
/// and prepend a dummy ALTER_METADATA to our replication queue.
/// It should not break anything if source_queue already contains an ALTER_METADATA entry
/// with a greater or equal metadata_version, but it will update our metadata
/// if all such entries were cleaned up from the log and source_queue.
LOG_WARNING(log, "Metadata version ({}) on replica is not up to date with metadata ({}) on source replica {}",
metadata_version, source_metadata_version, source_replica);
String source_metadata;
String source_columns;
while (true)
{
Coordination::Stat metadata_stat;
Coordination::Stat columns_stat;
source_metadata = zookeeper->get(source_path + "/metadata", &metadata_stat);
source_columns = zookeeper->get(source_path + "/columns", &columns_stat);
Coordination::Requests ops;
Coordination::Responses responses;
ops.emplace_back(zkutil::makeCheckRequest(source_path + "/metadata", metadata_stat.version));
ops.emplace_back(zkutil::makeCheckRequest(source_path + "/columns", columns_stat.version));
Coordination::Error code = zookeeper->tryMulti(ops, responses);
if (code == Coordination::Error::ZOK)
break;
else if (code == Coordination::Error::ZBADVERSION)
LOG_WARNING(log, "Metadata of replica {} was changed", source_path);
else
zkutil::KeeperMultiException::check(code, ops, responses);
}
ReplicatedMergeTreeLogEntryData dummy_alter;
dummy_alter.type = LogEntry::ALTER_METADATA;
dummy_alter.source_replica = source_replica;
dummy_alter.metadata_str = source_metadata;
dummy_alter.columns_str = source_columns;
dummy_alter.alter_version = source_metadata_version;
dummy_alter.create_time = time(nullptr);
/// We don't really know if it has a mutation or not, but it doesn't matter.
dummy_alter.have_mutation = false;
zookeeper->create(replica_path + "/queue/queue-", dummy_alter.toString(), zkutil::CreateMode::PersistentSequential);
/// TODO Do we also need to do something with mutation_pointer? Seems like we don't, but it's not clear from the code.
}
/// Add jobs to the queue to fetch all the active parts that the reference/master replica has.
Strings source_replica_parts = zookeeper->getChildren(source_path + "/parts");
ActiveDataPartSet active_parts_set(format_version, source_replica_parts);
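
The retry loop above is an instance of a general ZooKeeper pattern for reading several znodes consistently without locks: fetch each value together with its Stat, then commit a multi-op made only of version checks, and retry while any check fails with ZBADVERSION. A minimal standalone sketch of the same pattern using the Python kazoo client (the client setup and paths are illustrative, not part of this commit):

from kazoo.client import KazooClient

def read_consistent(zk, path_a, path_b):
    # Re-read until both values are proven to belong to the same moment:
    # the transaction of check requests succeeds only if neither znode
    # changed version between our reads and the commit.
    while True:
        data_a, stat_a = zk.get(path_a)
        data_b, stat_b = zk.get(path_b)
        txn = zk.transaction()
        txn.check(path_a, stat_a.version)
        txn.check(path_b, stat_b.version)
        results = txn.commit()
        if not any(isinstance(r, Exception) for r in results):
            return data_a, data_b
        # else: a version moved (kazoo's equivalent of ZBADVERSION); retry

zk = KazooClient(hosts="localhost:2181")
zk.start()
metadata, columns = read_consistent(zk, "/source_replica/metadata", "/source_replica/columns")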
@@ -4410,6 +4469,15 @@ bool StorageReplicatedMergeTree::optimize(
bool StorageReplicatedMergeTree::executeMetadataAlter(const StorageReplicatedMergeTree::LogEntry & entry)
{
if (entry.alter_version < metadata_version)
{
/// TODO Can we replace it with LOGICAL_ERROR?
LOG_WARNING(log, "Attempt to update metadata of version {} "
"to older version {} when processing log entry {}: {}",
metadata_version, entry.alter_version, entry.znode_name, entry.toString());
return true;
}
auto zookeeper = getZooKeeper();
auto columns_from_entry = ColumnsDescription::parse(entry.columns_str);
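
This early return is what makes the dummy ALTER_METADATA entry created during cloneReplica harmless: any entry whose alter_version is already behind the replica's metadata_version is acknowledged as done instead of applied. A hedged sketch of the version-gating idea in Python (the entry shape and apply_metadata helper are hypothetical, for illustration only):

def execute_metadata_alter(current_metadata_version, entry, apply_metadata):
    # Stale entry: a newer ALTER_METADATA (possibly the dummy one created
    # on recovery) has already advanced the metadata past this version,
    # so applying it again would move metadata backwards. Skip it.
    if entry["alter_version"] < current_metadata_version:
        return True  # report success so the queue entry is removed
    apply_metadata(entry["columns_str"], entry["metadata_str"])  # hypothetical helper
    return True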
@@ -4605,14 +4673,15 @@ void StorageReplicatedMergeTree::alter(
auto maybe_mutation_commands = commands.getMutationCommands(
*current_metadata, query_context->getSettingsRef().materialize_ttl_after_modify, query_context);
bool have_mutation = !maybe_mutation_commands.empty();
alter_entry->have_mutation = have_mutation;
alter_path_idx = ops.size();
ops.emplace_back(zkutil::makeCreateRequest(
zookeeper_path + "/log/log-", alter_entry->toString(), zkutil::CreateMode::PersistentSequential));
PartitionBlockNumbersHolder partition_block_numbers_holder;
if (have_mutation)
{
const String mutations_path(zookeeper_path + "/mutations");
@@ -4657,7 +4726,7 @@ void StorageReplicatedMergeTree::alter(
if (rc == Coordination::Error::ZOK)
{
if (have_mutation)
{
/// ALTER_METADATA record in replication /log
String alter_path = dynamic_cast<const Coordination::CreateResponse &>(*results[alter_path_idx]).path_created;

View File

@@ -4,28 +4,34 @@ import pytest
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import assert_eq_with_retry
SETTINGS = "SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0"
def fill_nodes(nodes):
for node in nodes:
node.query(
'''
CREATE TABLE test_table(date Date, id UInt32)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/replicated', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date)
{settings};
'''.format(replica=node.name, settings=SETTINGS))
cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance('node1', with_zookeeper=True)
node2 = cluster.add_instance('node2', with_zookeeper=True)
node3 = cluster.add_instance('node3', with_zookeeper=True)
nodes = [node1, node2, node3]
def sync_replicas(table):
for node in nodes:
node.query("SYSTEM SYNC REPLICA {}".format(table))
@pytest.fixture(scope="module")
def start_cluster():
try:
cluster.start()
fill_nodes([node1, node2, node3])
yield cluster
@@ -37,11 +43,11 @@ def start_cluster():
def test_recovery(start_cluster):
node1.query("INSERT INTO test_table VALUES (1, 1)")
time.sleep(1)
node1.query("INSERT INTO test_table VALUES (1, 0)")
sync_replicas("test_table")
node2.query("DETACH TABLE test_table")
for i in range(1, 11):
node1.query("INSERT INTO test_table VALUES (1, {})".format(i))
node2.query_with_retry("ATTACH TABLE test_table",
@@ -51,13 +57,17 @@ def test_recovery(start_cluster):
lost_marker = "Will mark replica node2 as lost"
assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker)
sync_replicas("test_table")
for node in nodes:
assert node.query("SELECT count(), sum(id) FROM test_table WHERE date=toDate(1)") == "11\t55\n"
def test_choose_source_replica(start_cluster):
node3.query("INSERT INTO test_table VALUES (2, 1)")
time.sleep(1)
node3.query("INSERT INTO test_table VALUES (2, 0)")
sync_replicas("test_table")
node2.query("DETACH TABLE test_table")
node1.query("SYSTEM STOP FETCHES test_table") # node1 will have many entries in queue, so node2 will clone node3
for i in range(1, 11):
node3.query("INSERT INTO test_table VALUES (2, {})".format(i))
node2.query_with_retry("ATTACH TABLE test_table",
@@ -74,3 +84,81 @@ def test_choose_source_replica(start_cluster):
assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker)
assert node2.contains_in_log("Will mimic node3")
sync_replicas("test_table")
for node in nodes:
assert node.query("SELECT count(), sum(id) FROM test_table WHERE date=toDate(2)") == "11\t55\n"
def test_update_metadata(start_cluster):
for node in nodes:
node.query(
'''
CREATE TABLE update_metadata(key UInt32)
ENGINE = ReplicatedMergeTree('/test/update_metadata', '{replica}') ORDER BY key PARTITION BY key % 10
{settings};
'''.format(replica=node.name, settings=SETTINGS))
for i in range(1, 11):
node1.query("INSERT INTO update_metadata VALUES ({})".format(i))
node2.query("DETACH TABLE update_metadata")
# alter without mutation
node1.query("ALTER TABLE update_metadata ADD COLUMN col1 UInt32")
for i in range(1, 11):
node1.query("INSERT INTO update_metadata VALUES ({}, {})".format(i * 10, i * 10))
lost_marker = "Will mark replica node2 as lost"
assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker)
node2.query("ATTACH TABLE update_metadata")
sync_replicas("update_metadata")
assert node1.query("DESC TABLE update_metadata") == node2.query("DESC TABLE update_metadata")
assert node1.query("DESC TABLE update_metadata") == node3.query("DESC TABLE update_metadata")
for node in nodes:
assert node.query("SELECT count(), sum(key), sum(col1) FROM update_metadata") == "20\t605\t550\n"
node2.query("DETACH TABLE update_metadata")
# alter with mutation
node1.query("ALTER TABLE update_metadata DROP COLUMN col1")
for i in range(1, 11):
node1.query("INSERT INTO update_metadata VALUES ({})".format(i * 100))
lost_marker = "Will mark replica node2 as lost"
assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker)
node2.query("ATTACH TABLE update_metadata")
sync_replicas("update_metadata")
assert node1.query("DESC TABLE update_metadata") == node2.query("DESC TABLE update_metadata")
assert node1.query("DESC TABLE update_metadata") == node3.query("DESC TABLE update_metadata")
# check that it's possible to execute an alter on the cloned replica
node2.query("ALTER TABLE update_metadata ADD COLUMN col1 UInt32")
sync_replicas("update_metadata")
for node in nodes:
assert node.query("SELECT count(), sum(key), sum(col1) FROM update_metadata") == "30\t6105\t0\n"
# more complex case with multiple alters
node2.query("TRUNCATE TABLE update_metadata")
for i in range(1, 11):
node1.query("INSERT INTO update_metadata VALUES ({}, {})".format(i, i))
# If the two commented-out lines below are enabled, the alters that follow hang with "No active replica has part ... or covering part"
#node2.query("SYSTEM STOP REPLICATED SENDS update_metadata")
#node2.query("INSERT INTO update_metadata VALUES (42, 42)") # this part will be lost
node2.query("DETACH TABLE update_metadata")
node1.query("ALTER TABLE update_metadata MODIFY COLUMN col1 String")
node1.query("ALTER TABLE update_metadata ADD COLUMN col2 INT")
for i in range(1, 11):
node3.query("INSERT INTO update_metadata VALUES ({}, '{}', {})".format(i * 10, i * 10, i * 10))
node1.query("ALTER TABLE update_metadata DROP COLUMN col1")
node1.query("ALTER TABLE update_metadata ADD COLUMN col3 Date")
node2.query("ATTACH TABLE update_metadata")
sync_replicas("update_metadata")
assert node1.query("DESC TABLE update_metadata") == node2.query("DESC TABLE update_metadata")
assert node1.query("DESC TABLE update_metadata") == node3.query("DESC TABLE update_metadata")
for node in nodes:
assert node.query("SELECT count(), sum(key), sum(col2) FROM update_metadata") == "20\t605\t550\n"