From bbf3bbc76d5309163004aa4f98fd3bbe46dd03fe Mon Sep 17 00:00:00 2001
From: Alexander Tokmakov
Date: Wed, 28 Apr 2021 20:49:27 +0300
Subject: [PATCH] update metadata on replica recovery

---
 .../MergeTree/ReplicatedMergeTreeLogEntry.h    |   1 +
 src/Storages/StorageReplicatedMergeTree.cpp    |  77 ++++++++++++-
 .../integration/test_recovery_replica/test.py  | 108 ++++++++++++++++--
 3 files changed, 172 insertions(+), 14 deletions(-)

diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h b/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h
index 309120560e7..23743523cbf 100644
--- a/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h
@@ -121,6 +121,7 @@ struct ReplicatedMergeTreeLogEntryData
     int alter_version = -1; /// May be equal to -1, if it's normal mutation, not metadata update.
 
     /// only ALTER METADATA command
+    /// TODO Seems like it's never used
     bool have_mutation = false; /// If this alter requires additional mutation step, for data update
 
     String columns_str; /// New columns data corresponding to alter_version
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 3b4a1ec4e16..c1932cb0fd5 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -2530,7 +2530,7 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo
         /// Let's remember the queue of the reference/master replica.
         source_queue_names = zookeeper->getChildren(source_path + "/queue");
 
-        /// Check that our log pointer didn't changed while we read queue entries
+        /// Check that the log pointer of the source replica didn't change while we read queue entries
         ops.push_back(zkutil::makeCheckRequest(source_path + "/log_pointer", log_pointer_stat.version));
 
         auto rc = zookeeper->tryMulti(ops, responses);
@@ -2576,6 +2576,65 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo
         source_queue.push_back(entry);
     }
 
+    /// We should do it after copying the queue, because some ALTER_METADATA entries can be lost otherwise.
+    Int32 source_metadata_version = parse<Int32>(zookeeper->get(source_path + "/metadata_version"));
+    if (metadata_version != source_metadata_version)
+    {
+        /// Our metadata is not up to date with the source replica's metadata.
+        /// Metadata is updated by ALTER_METADATA entries, but some entries are probably cleaned up from the log.
+        /// It's also possible that some newer ALTER_METADATA entries are present in the source_queue list,
+        /// and the source replica is executing such an entry right now (or has executed it recently).
+        /// More than that, the /metadata_version update is not atomic with the /columns and /metadata updates...
+
+        /// Fortunately, ALTER_METADATA seems to be idempotent,
+        /// and older entries of such type can be replaced with newer entries.
+        /// Let's try to get consistent values of the source replica's /columns and /metadata
+        /// and prepend a dummy ALTER_METADATA entry to our replication queue.
+        /// It should not break anything if source_queue already contains an ALTER_METADATA entry
+        /// with a greater or equal metadata_version, but it will update our metadata
+        /// if all such entries were cleaned up from the log and source_queue.
+ + LOG_WARNING(log, "Metadata version ({}) on replica is not up to date with metadata ({}) on source replica {}", + metadata_version, source_metadata_version, source_replica); + + String source_metadata; + String source_columns; + while (true) + { + Coordination::Stat metadata_stat; + Coordination::Stat columns_stat; + source_metadata = zookeeper->get(source_path + "/metadata", &metadata_stat); + source_columns = zookeeper->get(source_path + "/columns", &columns_stat); + + Coordination::Requests ops; + Coordination::Responses responses; + ops.emplace_back(zkutil::makeCheckRequest(source_path + "/metadata", metadata_stat.version)); + ops.emplace_back(zkutil::makeCheckRequest(source_path + "/columns", columns_stat.version)); + + Coordination::Error code = zookeeper->tryMulti(ops, responses); + if (code == Coordination::Error::ZOK) + break; + else if (code == Coordination::Error::ZBADVERSION) + LOG_WARNING(log, "Metadata of replica {} was changed", source_path); + else + zkutil::KeeperMultiException::check(code, ops, responses); + } + + ReplicatedMergeTreeLogEntryData dummy_alter; + dummy_alter.type = LogEntry::ALTER_METADATA; + dummy_alter.source_replica = source_replica; + dummy_alter.metadata_str = source_metadata; + dummy_alter.columns_str = source_columns; + dummy_alter.alter_version = source_metadata_version; + dummy_alter.create_time = time(nullptr); + /// We don't really know if it has mutation or not, but it doesn't matter. + dummy_alter.have_mutation = false; + + zookeeper->create(replica_path + "/queue/queue-", dummy_alter.toString(), zkutil::CreateMode::PersistentSequential); + + /// TODO Do we also need to do something with mutation_pointer? Seems like we don't, but it's not clear from the code. + } + /// Add to the queue jobs to receive all the active parts that the reference/master replica has. Strings source_replica_parts = zookeeper->getChildren(source_path + "/parts"); ActiveDataPartSet active_parts_set(format_version, source_replica_parts); @@ -4410,6 +4469,15 @@ bool StorageReplicatedMergeTree::optimize( bool StorageReplicatedMergeTree::executeMetadataAlter(const StorageReplicatedMergeTree::LogEntry & entry) { + if (entry.alter_version < metadata_version) + { + /// TODO Can we replace it with LOGICAL_ERROR? 
+ LOG_WARNING(log, "Attempt to update metadata of version {} " + "to older version {} when processing log entry {}: {}", + metadata_version, entry.alter_version, entry.znode_name, entry.toString()); + return true; + } + auto zookeeper = getZooKeeper(); auto columns_from_entry = ColumnsDescription::parse(entry.columns_str); @@ -4605,14 +4673,15 @@ void StorageReplicatedMergeTree::alter( auto maybe_mutation_commands = commands.getMutationCommands( *current_metadata, query_context->getSettingsRef().materialize_ttl_after_modify, query_context); - alter_entry->have_mutation = !maybe_mutation_commands.empty(); + bool have_mutation = !maybe_mutation_commands.empty(); + alter_entry->have_mutation = have_mutation; alter_path_idx = ops.size(); ops.emplace_back(zkutil::makeCreateRequest( zookeeper_path + "/log/log-", alter_entry->toString(), zkutil::CreateMode::PersistentSequential)); PartitionBlockNumbersHolder partition_block_numbers_holder; - if (alter_entry->have_mutation) + if (have_mutation) { const String mutations_path(zookeeper_path + "/mutations"); @@ -4657,7 +4726,7 @@ void StorageReplicatedMergeTree::alter( if (rc == Coordination::Error::ZOK) { - if (alter_entry->have_mutation) + if (have_mutation) { /// ALTER_METADATA record in replication /log String alter_path = dynamic_cast(*results[alter_path_idx]).path_created; diff --git a/tests/integration/test_recovery_replica/test.py b/tests/integration/test_recovery_replica/test.py index 22dba7aafec..bf869d0de31 100644 --- a/tests/integration/test_recovery_replica/test.py +++ b/tests/integration/test_recovery_replica/test.py @@ -4,28 +4,34 @@ import pytest from helpers.cluster import ClickHouseCluster from helpers.test_tools import assert_eq_with_retry +SETTINGS = "SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0" -def fill_nodes(nodes, shard): +def fill_nodes(nodes): for node in nodes: node.query( ''' CREATE TABLE test_table(date Date, id UInt32) - ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/replicated', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; - '''.format(shard=shard, replica=node.name)) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/replicated', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) + {settings}; + '''.format(replica=node.name, settings=SETTINGS)) cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance('node1', with_zookeeper=True) node2 = cluster.add_instance('node2', with_zookeeper=True) node3 = cluster.add_instance('node3', with_zookeeper=True) +nodes = [node1, node2, node3] +def sync_replicas(table): + for node in nodes: + node.query("SYSTEM SYNC REPLICA {}".format(table)) @pytest.fixture(scope="module") def start_cluster(): try: cluster.start() - fill_nodes([node1, node2, node3], 1) + fill_nodes([node1, node2, node3]) yield cluster @@ -37,11 +43,11 @@ def start_cluster(): def test_recovery(start_cluster): - node1.query("INSERT INTO test_table VALUES (1, 1)") - time.sleep(1) + node1.query("INSERT INTO test_table VALUES (1, 0)") + sync_replicas("test_table") node2.query("DETACH TABLE test_table") - for i in range(100): + for i in range(1, 11): node1.query("INSERT INTO test_table VALUES (1, {})".format(i)) node2.query_with_retry("ATTACH TABLE test_table", @@ -51,13 +57,17 @@ def test_recovery(start_cluster): lost_marker = "Will mark replica node2 as lost" assert 
node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker) + sync_replicas("test_table") + for node in nodes: + assert node.query("SELECT count(), sum(id) FROM test_table WHERE date=toDate(1)") == "11\t55\n" + def test_choose_source_replica(start_cluster): - node3.query("INSERT INTO test_table VALUES (2, 1)") - time.sleep(1) + node3.query("INSERT INTO test_table VALUES (2, 0)") + sync_replicas("test_table") node2.query("DETACH TABLE test_table") node1.query("SYSTEM STOP FETCHES test_table") # node1 will have many entries in queue, so node2 will clone node3 - for i in range(100): + for i in range(1, 11): node3.query("INSERT INTO test_table VALUES (2, {})".format(i)) node2.query_with_retry("ATTACH TABLE test_table", @@ -74,3 +84,81 @@ def test_choose_source_replica(start_cluster): assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker) assert node2.contains_in_log("Will mimic node3") + sync_replicas("test_table") + for node in nodes: + assert node.query("SELECT count(), sum(id) FROM test_table WHERE date=toDate(2)") == "11\t55\n" + + +def test_update_metadata(start_cluster): + for node in nodes: + node.query( + ''' + CREATE TABLE update_metadata(key UInt32) + ENGINE = ReplicatedMergeTree('/test/update_metadata', '{replica}') ORDER BY key PARTITION BY key % 10 + {settings}; + '''.format(replica=node.name, settings=SETTINGS)) + + for i in range(1, 11): + node1.query("INSERT INTO update_metadata VALUES ({})".format(i)) + + node2.query("DETACH TABLE update_metadata") + # alter without mutation + node1.query("ALTER TABLE update_metadata ADD COLUMN col1 UInt32") + + for i in range(1, 11): + node1.query("INSERT INTO update_metadata VALUES ({}, {})".format(i * 10, i * 10)) + + lost_marker = "Will mark replica node2 as lost" + assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker) + + node2.query("ATTACH TABLE update_metadata") + sync_replicas("update_metadata") + assert node1.query("DESC TABLE update_metadata") == node2.query("DESC TABLE update_metadata") + assert node1.query("DESC TABLE update_metadata") == node3.query("DESC TABLE update_metadata") + for node in nodes: + assert node.query("SELECT count(), sum(key), sum(col1) FROM update_metadata") == "20\t605\t550\n" + + node2.query("DETACH TABLE update_metadata") + # alter with mutation + node1.query("ALTER TABLE update_metadata DROP COLUMN col1") + for i in range(1, 11): + node1.query("INSERT INTO update_metadata VALUES ({})".format(i * 100)) + + lost_marker = "Will mark replica node2 as lost" + assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker) + + node2.query("ATTACH TABLE update_metadata") + sync_replicas("update_metadata") + assert node1.query("DESC TABLE update_metadata") == node2.query("DESC TABLE update_metadata") + assert node1.query("DESC TABLE update_metadata") == node3.query("DESC TABLE update_metadata") + + # check that it's possible to execute alter on cloned replica + node2.query("ALTER TABLE update_metadata ADD COLUMN col1 UInt32") + sync_replicas("update_metadata") + for node in nodes: + assert node.query("SELECT count(), sum(key), sum(col1) FROM update_metadata") == "30\t6105\t0\n" + + # more complex case with multiple alters + node2.query("TRUNCATE TABLE update_metadata") + for i in range(1, 11): + node1.query("INSERT INTO update_metadata VALUES ({}, {})".format(i, i)) + + # The following alters hang because of "No active replica has part ... 
or covering part" + #node2.query("SYSTEM STOP REPLICATED SENDS update_metadata") + #node2.query("INSERT INTO update_metadata VALUES (42, 42)") # this part will be lost + node2.query("DETACH TABLE update_metadata") + + node1.query("ALTER TABLE update_metadata MODIFY COLUMN col1 String") + node1.query("ALTER TABLE update_metadata ADD COLUMN col2 INT") + for i in range(1, 11): + node3.query("INSERT INTO update_metadata VALUES ({}, '{}', {})".format(i * 10, i * 10, i * 10)) + node1.query("ALTER TABLE update_metadata DROP COLUMN col1") + node1.query("ALTER TABLE update_metadata ADD COLUMN col3 Date") + + node2.query("ATTACH TABLE update_metadata") + sync_replicas("update_metadata") + assert node1.query("DESC TABLE update_metadata") == node2.query("DESC TABLE update_metadata") + assert node1.query("DESC TABLE update_metadata") == node3.query("DESC TABLE update_metadata") + for node in nodes: + assert node.query("SELECT count(), sum(key), sum(col2) FROM update_metadata") == "20\t605\t550\n" +