update metadata on replica recovery

Alexander Tokmakov 2021-04-28 20:49:27 +03:00
parent a4a2a61eed
commit bbf3bbc76d
3 changed files with 172 additions and 14 deletions

View File

@@ -121,6 +121,7 @@ struct ReplicatedMergeTreeLogEntryData
int alter_version = -1; /// May be equal to -1, if it's a normal mutation, not a metadata update.
/// only ALTER METADATA command
/// TODO Seems like it's never used
bool have_mutation = false; /// If this alter requires an additional mutation step, for data update
String columns_str; /// New columns data corresponding to alter_version

View File

@@ -2530,7 +2530,7 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo
/// Let's remember the queue of the reference/master replica.
source_queue_names = zookeeper->getChildren(source_path + "/queue");
/// Check that the log pointer of the source replica didn't change while we read queue entries
ops.push_back(zkutil::makeCheckRequest(source_path + "/log_pointer", log_pointer_stat.version));
auto rc = zookeeper->tryMulti(ops, responses);
@@ -2576,6 +2576,65 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo
source_queue.push_back(entry);
}
/// We should do this after copying the queue, because otherwise some ALTER_METADATA entries can be lost.
Int32 source_metadata_version = parse<Int32>(zookeeper->get(source_path + "/metadata_version"));
if (metadata_version != source_metadata_version)
{
/// Our metadata is not up to date with the source replica's metadata.
/// Metadata is updated by ALTER_METADATA entries, but some entries have probably been cleaned up from the log.
/// It's also possible that some newer ALTER_METADATA entries are present in the source_queue list,
/// and the source replica is executing such an entry right now (or has executed it recently).
/// Moreover, the /metadata_version update is not atomic with the /columns and /metadata updates...
/// Fortunately, ALTER_METADATA seems to be idempotent,
/// and older entries of that type can be replaced with newer ones.
/// Let's try to get consistent values of the source replica's /columns and /metadata
/// and prepend a dummy ALTER_METADATA to our replication queue.
/// It should not break anything if source_queue already contains an ALTER_METADATA entry
/// with a greater or equal metadata_version, but it will update our metadata
/// if all such entries were cleaned up from the log and source_queue.
LOG_WARNING(log, "Metadata version ({}) on replica is not up to date with metadata ({}) on source replica {}",
metadata_version, source_metadata_version, source_replica);
String source_metadata;
String source_columns;
while (true)
{
Coordination::Stat metadata_stat;
Coordination::Stat columns_stat;
source_metadata = zookeeper->get(source_path + "/metadata", &metadata_stat);
source_columns = zookeeper->get(source_path + "/columns", &columns_stat);
Coordination::Requests ops;
Coordination::Responses responses;
ops.emplace_back(zkutil::makeCheckRequest(source_path + "/metadata", metadata_stat.version));
ops.emplace_back(zkutil::makeCheckRequest(source_path + "/columns", columns_stat.version));
Coordination::Error code = zookeeper->tryMulti(ops, responses);
if (code == Coordination::Error::ZOK)
break;
else if (code == Coordination::Error::ZBADVERSION)
LOG_WARNING(log, "Metadata of replica {} was changed", source_path);
else
zkutil::KeeperMultiException::check(code, ops, responses);
}
ReplicatedMergeTreeLogEntryData dummy_alter;
dummy_alter.type = LogEntry::ALTER_METADATA;
dummy_alter.source_replica = source_replica;
dummy_alter.metadata_str = source_metadata;
dummy_alter.columns_str = source_columns;
dummy_alter.alter_version = source_metadata_version;
dummy_alter.create_time = time(nullptr);
/// We don't really know if it has a mutation or not, but it doesn't matter.
dummy_alter.have_mutation = false;
zookeeper->create(replica_path + "/queue/queue-", dummy_alter.toString(), zkutil::CreateMode::PersistentSequential);
/// TODO Do we also need to do something with mutation_pointer? Seems like we don't, but it's not clear from the code.
}
/// Add jobs to the queue to fetch all the active parts that the reference/master replica has.
Strings source_replica_parts = zookeeper->getChildren(source_path + "/parts");
ActiveDataPartSet active_parts_set(format_version, source_replica_parts);
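
The retry loop above is an instance of a general ZooKeeper pattern for reading several znodes consistently without locks: fetch each value together with its Stat, then commit a multi-op made only of version checks, and retry while any check fails with ZBADVERSION. A minimal standalone sketch of the same pattern using the Python kazoo client (the client setup and paths are illustrative, not part of this commit):

from kazoo.client import KazooClient

def read_consistent(zk, path_a, path_b):
    # Re-read until both values are proven to belong to the same moment:
    # the transaction of check requests succeeds only if neither znode
    # changed version between our reads and the commit.
    while True:
        data_a, stat_a = zk.get(path_a)
        data_b, stat_b = zk.get(path_b)
        txn = zk.transaction()
        txn.check(path_a, stat_a.version)
        txn.check(path_b, stat_b.version)
        results = txn.commit()
        if not any(isinstance(r, Exception) for r in results):
            return data_a, data_b
        # else: a version moved (kazoo's equivalent of ZBADVERSION); retry

zk = KazooClient(hosts="localhost:2181")
zk.start()
metadata, columns = read_consistent(zk, "/source_replica/metadata", "/source_replica/columns")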
@@ -4410,6 +4469,15 @@ bool StorageReplicatedMergeTree::optimize(
bool StorageReplicatedMergeTree::executeMetadataAlter(const StorageReplicatedMergeTree::LogEntry & entry)
{
if (entry.alter_version < metadata_version)
{
/// TODO Can we replace it with LOGICAL_ERROR?
LOG_WARNING(log, "Attempt to update metadata of version {} "
"to older version {} when processing log entry {}: {}",
metadata_version, entry.alter_version, entry.znode_name, entry.toString());
return true;
}
auto zookeeper = getZooKeeper();
auto columns_from_entry = ColumnsDescription::parse(entry.columns_str);
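
This early return is what makes the dummy ALTER_METADATA entry created during cloneReplica harmless: any entry whose alter_version is already behind the replica's metadata_version is acknowledged as done instead of applied. A hedged sketch of the version-gating idea in Python (the entry shape and apply_metadata helper are hypothetical, for illustration only):

def execute_metadata_alter(current_metadata_version, entry, apply_metadata):
    # Stale entry: a newer ALTER_METADATA (possibly the dummy one created
    # on recovery) has already advanced the metadata past this version,
    # so applying it again would move metadata backwards. Skip it.
    if entry["alter_version"] < current_metadata_version:
        return True  # report success so the queue entry is removed
    apply_metadata(entry["columns_str"], entry["metadata_str"])  # hypothetical helper
    return True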
@@ -4605,14 +4673,15 @@ void StorageReplicatedMergeTree::alter(
auto maybe_mutation_commands = commands.getMutationCommands(
*current_metadata, query_context->getSettingsRef().materialize_ttl_after_modify, query_context);
bool have_mutation = !maybe_mutation_commands.empty();
alter_entry->have_mutation = have_mutation;
alter_path_idx = ops.size();
ops.emplace_back(zkutil::makeCreateRequest(
zookeeper_path + "/log/log-", alter_entry->toString(), zkutil::CreateMode::PersistentSequential));
PartitionBlockNumbersHolder partition_block_numbers_holder;
if (have_mutation)
{
const String mutations_path(zookeeper_path + "/mutations");
@@ -4657,7 +4726,7 @@ void StorageReplicatedMergeTree::alter(
if (rc == Coordination::Error::ZOK)
{
if (have_mutation)
{
/// ALTER_METADATA record in replication /log
String alter_path = dynamic_cast<const Coordination::CreateResponse &>(*results[alter_path_idx]).path_created;

View File

@@ -4,28 +4,34 @@ import pytest
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import assert_eq_with_retry
SETTINGS = "SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0"
def fill_nodes(nodes):
for node in nodes:
node.query(
'''
CREATE TABLE test_table(date Date, id UInt32)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/replicated', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date)
{settings};
'''.format(replica=node.name, settings=SETTINGS))
cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance('node1', with_zookeeper=True)
node2 = cluster.add_instance('node2', with_zookeeper=True)
node3 = cluster.add_instance('node3', with_zookeeper=True)
nodes = [node1, node2, node3]
def sync_replicas(table):
for node in nodes:
node.query("SYSTEM SYNC REPLICA {}".format(table))
@pytest.fixture(scope="module")
def start_cluster():
try:
cluster.start()
fill_nodes([node1, node2, node3])
yield cluster
@@ -37,11 +43,11 @@ def start_cluster():
def test_recovery(start_cluster):
node1.query("INSERT INTO test_table VALUES (1, 1)")
time.sleep(1)
node1.query("INSERT INTO test_table VALUES (1, 0)")
sync_replicas("test_table")
node2.query("DETACH TABLE test_table")
for i in range(1, 11):
node1.query("INSERT INTO test_table VALUES (1, {})".format(i))
node2.query_with_retry("ATTACH TABLE test_table",
@@ -51,13 +57,17 @@ def test_recovery(start_cluster):
lost_marker = "Will mark replica node2 as lost"
assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker)
sync_replicas("test_table")
for node in nodes:
assert node.query("SELECT count(), sum(id) FROM test_table WHERE date=toDate(1)") == "11\t55\n"
def test_choose_source_replica(start_cluster):
node3.query("INSERT INTO test_table VALUES (2, 1)")
time.sleep(1)
node3.query("INSERT INTO test_table VALUES (2, 0)")
sync_replicas("test_table")
node2.query("DETACH TABLE test_table")
node1.query("SYSTEM STOP FETCHES test_table") # node1 will have many entries in queue, so node2 will clone node3
for i in range(1, 11):
node3.query("INSERT INTO test_table VALUES (2, {})".format(i))
node2.query_with_retry("ATTACH TABLE test_table",
@@ -74,3 +84,81 @@ def test_choose_source_replica(start_cluster):
assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker)
assert node2.contains_in_log("Will mimic node3")
sync_replicas("test_table")
for node in nodes:
assert node.query("SELECT count(), sum(id) FROM test_table WHERE date=toDate(2)") == "11\t55\n"
def test_update_metadata(start_cluster):
for node in nodes:
node.query(
'''
CREATE TABLE update_metadata(key UInt32)
ENGINE = ReplicatedMergeTree('/test/update_metadata', '{replica}') ORDER BY key PARTITION BY key % 10
{settings};
'''.format(replica=node.name, settings=SETTINGS))
for i in range(1, 11):
node1.query("INSERT INTO update_metadata VALUES ({})".format(i))
node2.query("DETACH TABLE update_metadata")
# alter without mutation
node1.query("ALTER TABLE update_metadata ADD COLUMN col1 UInt32")
for i in range(1, 11):
node1.query("INSERT INTO update_metadata VALUES ({}, {})".format(i * 10, i * 10))
lost_marker = "Will mark replica node2 as lost"
assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker)
node2.query("ATTACH TABLE update_metadata")
sync_replicas("update_metadata")
assert node1.query("DESC TABLE update_metadata") == node2.query("DESC TABLE update_metadata")
assert node1.query("DESC TABLE update_metadata") == node3.query("DESC TABLE update_metadata")
for node in nodes:
assert node.query("SELECT count(), sum(key), sum(col1) FROM update_metadata") == "20\t605\t550\n"
node2.query("DETACH TABLE update_metadata")
# alter with mutation
node1.query("ALTER TABLE update_metadata DROP COLUMN col1")
for i in range(1, 11):
node1.query("INSERT INTO update_metadata VALUES ({})".format(i * 100))
lost_marker = "Will mark replica node2 as lost"
assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker)
node2.query("ATTACH TABLE update_metadata")
sync_replicas("update_metadata")
assert node1.query("DESC TABLE update_metadata") == node2.query("DESC TABLE update_metadata")
assert node1.query("DESC TABLE update_metadata") == node3.query("DESC TABLE update_metadata")
# check that it's possible to execute an alter on the cloned replica
node2.query("ALTER TABLE update_metadata ADD COLUMN col1 UInt32")
sync_replicas("update_metadata")
for node in nodes:
assert node.query("SELECT count(), sum(key), sum(col1) FROM update_metadata") == "30\t6105\t0\n"
# more complex case with multiple alters
node2.query("TRUNCATE TABLE update_metadata")
for i in range(1, 11):
node1.query("INSERT INTO update_metadata VALUES ({}, {})".format(i, i))
# If the two commented-out lines below are enabled, the alters that follow hang with "No active replica has part ... or covering part"
#node2.query("SYSTEM STOP REPLICATED SENDS update_metadata")
#node2.query("INSERT INTO update_metadata VALUES (42, 42)") # this part will be lost
node2.query("DETACH TABLE update_metadata")
node1.query("ALTER TABLE update_metadata MODIFY COLUMN col1 String")
node1.query("ALTER TABLE update_metadata ADD COLUMN col2 INT")
for i in range(1, 11):
node3.query("INSERT INTO update_metadata VALUES ({}, '{}', {})".format(i * 10, i * 10, i * 10))
node1.query("ALTER TABLE update_metadata DROP COLUMN col1")
node1.query("ALTER TABLE update_metadata ADD COLUMN col3 Date")
node2.query("ATTACH TABLE update_metadata")
sync_replicas("update_metadata")
assert node1.query("DESC TABLE update_metadata") == node2.query("DESC TABLE update_metadata")
assert node1.query("DESC TABLE update_metadata") == node3.query("DESC TABLE update_metadata")
for node in nodes:
assert node.query("SELECT count(), sum(key), sum(col2) FROM update_metadata") == "20\t605\t550\n"