Mirror of https://github.com/ClickHouse/ClickHouse.git (synced 2024-12-18 04:12:19 +00:00)
update metadata on replica recovery

commit bbf3bbc76d
parent a4a2a61eed
@@ -121,6 +121,7 @@ struct ReplicatedMergeTreeLogEntryData
    int alter_version = -1; /// May be equal to -1, if it's normal mutation, not metadata update.

    /// only ALTER METADATA command
    /// TODO Seems like it's never used
    bool have_mutation = false; /// If this alter requires additional mutation step, for data update

    String columns_str; /// New columns data corresponding to alter_version
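For orientation, a rough sketch in Python (illustrative only, not ClickHouse code) of how these fields fit together; the dataclass, its default values and the helper below are assumptions made for the example, with names mirroring the struct above:

from dataclasses import dataclass

@dataclass
class LogEntry:
    type: str = "ALTER_METADATA"   # kind of replication log entry
    alter_version: int = -1        # -1 for a plain mutation, >= 0 for a metadata update
    have_mutation: bool = False    # whether this ALTER also needs a data-rewriting mutation step
    columns_str: str = ""          # serialized columns corresponding to alter_version
    metadata_str: str = ""         # serialized table metadata corresponding to alter_version

def is_metadata_alter(entry: LogEntry) -> bool:
    # Hypothetical helper: an entry with alter_version >= 0 carries a metadata update.
    return entry.alter_version >= 0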
@@ -2530,7 +2530,7 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo
    /// Let's remember the queue of the reference/master replica.
    source_queue_names = zookeeper->getChildren(source_path + "/queue");

    /// Check that our log pointer didn't changed while we read queue entries
    /// Check that log pointer of source replica didn't changed while we read queue entries
    ops.push_back(zkutil::makeCheckRequest(source_path + "/log_pointer", log_pointer_stat.version));

    auto rc = zookeeper->tryMulti(ops, responses);
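The hunk above copies the source replica's queue and then verifies, in the same ZooKeeper multi-request, that the source's /log_pointer version did not change while the queue was being read; if it did, the whole batch fails and the clone is retried. A minimal sketch of the same read-then-check pattern with the Python kazoo client (the host, the replica path and the retry handling are assumptions made for illustration, not part of the commit):

from kazoo.client import KazooClient

zk = KazooClient(hosts="127.0.0.1:2181")  # assumed local ZooKeeper, for illustration only
zk.start()

source_path = "/clickhouse/tables/test/replicated/replicas/node3"  # hypothetical source replica path

def copy_queue_checked(zk, source_path):
    # Remember the version of /log_pointer, then read the queue.
    _, log_pointer_stat = zk.get(source_path + "/log_pointer")
    queue_names = zk.get_children(source_path + "/queue")

    # Verify in one multi-request that /log_pointer is still at the remembered version.
    txn = zk.transaction()
    txn.check(source_path + "/log_pointer", version=log_pointer_stat.version)
    results = txn.commit()

    if any(isinstance(r, Exception) for r in results):
        return None  # the pointer moved while we were reading, caller should retry
    return queue_names

queue = copy_queue_checked(zk, source_path)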
@@ -2576,6 +2576,65 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo
        source_queue.push_back(entry);
    }

    /// We should do it after copying queue, because some ALTER_METADATA entries can be lost otherwise.
    Int32 source_metadata_version = parse<Int32>(zookeeper->get(source_path + "/metadata_version"));
    if (metadata_version != source_metadata_version)
    {
        /// Our metadata it not up to date with source replica metadata.
        /// Metadata is updated by ALTER_METADATA entries, but some entries are probably cleaned up from the log.
        /// It's also possible that some newer ALTER_METADATA entries are present in source_queue list,
        /// and source replica are executing such entry right not (or had executed recently).
        /// More than that, /metadata_version update is not atomic with /columns and /metadata update...

        /// Fortunately, ALTER_METADATA seems to be idempotent,
        /// and older entries of such type can be replaced with newer entries.
        /// Let's try to get consistent values of source replica's /columns and /metadata
        /// and prepend dummy ALTER_METADATA to our replication queue.
        /// It should not break anything if source_queue already contains ALTER_METADATA entry
        /// with greater or equal metadata_version, but it will update our metadata
        /// if all such entries were cleaned up from the log and source_queue.

        LOG_WARNING(log, "Metadata version ({}) on replica is not up to date with metadata ({}) on source replica {}",
                    metadata_version, source_metadata_version, source_replica);

        String source_metadata;
        String source_columns;
        while (true)
        {
            Coordination::Stat metadata_stat;
            Coordination::Stat columns_stat;
            source_metadata = zookeeper->get(source_path + "/metadata", &metadata_stat);
            source_columns = zookeeper->get(source_path + "/columns", &columns_stat);

            Coordination::Requests ops;
            Coordination::Responses responses;
            ops.emplace_back(zkutil::makeCheckRequest(source_path + "/metadata", metadata_stat.version));
            ops.emplace_back(zkutil::makeCheckRequest(source_path + "/columns", columns_stat.version));

            Coordination::Error code = zookeeper->tryMulti(ops, responses);
            if (code == Coordination::Error::ZOK)
                break;
            else if (code == Coordination::Error::ZBADVERSION)
                LOG_WARNING(log, "Metadata of replica {} was changed", source_path);
            else
                zkutil::KeeperMultiException::check(code, ops, responses);
        }

        ReplicatedMergeTreeLogEntryData dummy_alter;
        dummy_alter.type = LogEntry::ALTER_METADATA;
        dummy_alter.source_replica = source_replica;
        dummy_alter.metadata_str = source_metadata;
        dummy_alter.columns_str = source_columns;
        dummy_alter.alter_version = source_metadata_version;
        dummy_alter.create_time = time(nullptr);
        /// We don't really know if it has mutation or not, but it doesn't matter.
        dummy_alter.have_mutation = false;

        zookeeper->create(replica_path + "/queue/queue-", dummy_alter.toString(), zkutil::CreateMode::PersistentSequential);

        /// TODO Do we also need to do something with mutation_pointer? Seems like we don't, but it's not clear from the code.
    }

    /// Add to the queue jobs to receive all the active parts that the reference/master replica has.
    Strings source_replica_parts = zookeeper->getChildren(source_path + "/parts");
    ActiveDataPartSet active_parts_set(format_version, source_replica_parts);
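The while (true) loop added above is an optimistic retry: read /metadata and /columns, then confirm in a single multi-request that neither znode changed in between, and only then enqueue the dummy ALTER_METADATA entry. A rough kazoo-based sketch of the same loop; the paths are made up and the entry payload is a simplified stand-in for ReplicatedMergeTreeLogEntryData::toString(), so this illustrates the pattern rather than the actual implementation:

from kazoo.client import KazooClient

zk = KazooClient(hosts="127.0.0.1:2181")  # assumed local ZooKeeper
zk.start()

source_path = "/clickhouse/tables/test/replicated/replicas/node3"   # hypothetical source replica
replica_path = "/clickhouse/tables/test/replicated/replicas/node2"  # hypothetical recovering replica

def snapshot_source_metadata(zk, source_path):
    # Retry until /metadata and /columns are read without a concurrent change.
    while True:
        metadata, metadata_stat = zk.get(source_path + "/metadata")
        columns, columns_stat = zk.get(source_path + "/columns")

        txn = zk.transaction()
        txn.check(source_path + "/metadata", version=metadata_stat.version)
        txn.check(source_path + "/columns", version=columns_stat.version)
        results = txn.commit()

        if not any(isinstance(r, Exception) for r in results):
            return metadata, columns  # a consistent pair
        # one of the znodes changed while we were reading, try again

metadata, columns = snapshot_source_metadata(zk, source_path)
source_metadata_version = int(zk.get(source_path + "/metadata_version")[0])

# Prepend a dummy metadata-update entry to our own replication queue
# (the serialization here is simplified for the example).
dummy_entry = b"ALTER_METADATA version=%d" % source_metadata_version
zk.create(replica_path + "/queue/queue-", dummy_entry, sequence=True)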
@@ -4410,6 +4469,15 @@ bool StorageReplicatedMergeTree::optimize(

bool StorageReplicatedMergeTree::executeMetadataAlter(const StorageReplicatedMergeTree::LogEntry & entry)
{
    if (entry.alter_version < metadata_version)
    {
        /// TODO Can we replace it with LOGICAL_ERROR?
        LOG_WARNING(log, "Attempt to update metadata of version {} "
                         "to older version {} when processing log entry {}: {}",
                    metadata_version, entry.alter_version, entry.znode_name, entry.toString());
        return true;
    }

    auto zookeeper = getZooKeeper();

    auto columns_from_entry = ColumnsDescription::parse(entry.columns_str);
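executeMetadataAlter() simply skips entries whose alter_version is older than the replica's current metadata_version, and this guard is what makes the dummy ALTER_METADATA from cloneReplica() safe: replaying an older or duplicate metadata update is a no-op. A small self-contained Python sketch of that idempotence, using plain dictionaries instead of ClickHouse or ZooKeeper state, purely to show the version comparison:

def execute_metadata_alter(state, entry):
    # Apply a metadata-update entry, ignoring anything older than what we already have.
    if entry["alter_version"] < state["metadata_version"]:
        return state  # stale entry: skip, same as the early return above
    return dict(state,
                metadata_version=entry["alter_version"],
                columns=entry["columns_str"])

state = {"metadata_version": 1, "columns": "date Date, id UInt32"}

queue = [
    # dummy entry prepended during recovery (carries the source's current metadata)
    {"alter_version": 3, "columns_str": "date Date, id UInt32, col1 UInt32"},
    # an older ALTER_METADATA that was still sitting in the copied queue
    {"alter_version": 2, "columns_str": "date Date, id UInt32"},
    # a newer one produced after the clone
    {"alter_version": 4, "columns_str": "date Date, id UInt32, col1 UInt32, col2 Int32"},
]

for entry in queue:
    state = execute_metadata_alter(state, entry)

assert state["metadata_version"] == 4  # the stale entry changed nothing, newer ones applied in order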
@@ -4605,14 +4673,15 @@ void StorageReplicatedMergeTree::alter(

        auto maybe_mutation_commands = commands.getMutationCommands(
            *current_metadata, query_context->getSettingsRef().materialize_ttl_after_modify, query_context);
        alter_entry->have_mutation = !maybe_mutation_commands.empty();
        bool have_mutation = !maybe_mutation_commands.empty();
        alter_entry->have_mutation = have_mutation;

        alter_path_idx = ops.size();
        ops.emplace_back(zkutil::makeCreateRequest(
            zookeeper_path + "/log/log-", alter_entry->toString(), zkutil::CreateMode::PersistentSequential));

        PartitionBlockNumbersHolder partition_block_numbers_holder;
        if (alter_entry->have_mutation)
        if (have_mutation)
        {
            const String mutations_path(zookeeper_path + "/mutations");
@@ -4657,7 +4726,7 @@ void StorageReplicatedMergeTree::alter(

        if (rc == Coordination::Error::ZOK)
        {
            if (alter_entry->have_mutation)
            if (have_mutation)
            {
                /// ALTER_METADATA record in replication /log
                String alter_path = dynamic_cast<const Coordination::CreateResponse &>(*results[alter_path_idx]).path_created;
@@ -4,28 +4,34 @@ import pytest
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import assert_eq_with_retry

SETTINGS = "SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0"

def fill_nodes(nodes, shard):
def fill_nodes(nodes):
    for node in nodes:
        node.query(
        '''
            CREATE TABLE test_table(date Date, id UInt32)
            ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/replicated', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0;
        '''.format(shard=shard, replica=node.name))
            ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/replicated', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date)
            {settings};
        '''.format(replica=node.name, settings=SETTINGS))


cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance('node1', with_zookeeper=True)
node2 = cluster.add_instance('node2', with_zookeeper=True)
node3 = cluster.add_instance('node3', with_zookeeper=True)
nodes = [node1, node2, node3]

def sync_replicas(table):
    for node in nodes:
        node.query("SYSTEM SYNC REPLICA {}".format(table))

@pytest.fixture(scope="module")
def start_cluster():
    try:
        cluster.start()

        fill_nodes([node1, node2, node3], 1)
        fill_nodes([node1, node2, node3])

        yield cluster
@@ -37,11 +43,11 @@ def start_cluster():


def test_recovery(start_cluster):
    node1.query("INSERT INTO test_table VALUES (1, 1)")
    time.sleep(1)
    node1.query("INSERT INTO test_table VALUES (1, 0)")
    sync_replicas("test_table")
    node2.query("DETACH TABLE test_table")

    for i in range(100):
    for i in range(1, 11):
        node1.query("INSERT INTO test_table VALUES (1, {})".format(i))

    node2.query_with_retry("ATTACH TABLE test_table",
@@ -51,13 +57,17 @@ def test_recovery(start_cluster):
    lost_marker = "Will mark replica node2 as lost"
    assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker)

    sync_replicas("test_table")
    for node in nodes:
        assert node.query("SELECT count(), sum(id) FROM test_table WHERE date=toDate(1)") == "11\t55\n"

def test_choose_source_replica(start_cluster):
    node3.query("INSERT INTO test_table VALUES (2, 1)")
    time.sleep(1)
    node3.query("INSERT INTO test_table VALUES (2, 0)")
    sync_replicas("test_table")
    node2.query("DETACH TABLE test_table")
    node1.query("SYSTEM STOP FETCHES test_table")  # node1 will have many entries in queue, so node2 will clone node3

    for i in range(100):
    for i in range(1, 11):
        node3.query("INSERT INTO test_table VALUES (2, {})".format(i))

    node2.query_with_retry("ATTACH TABLE test_table",
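The "11\t55\n" expected in these tests is plain arithmetic over the inserted rows: one initial row with id 0 per partition plus the ten rows with ids 1..10 inserted while node2 was detached. A quick standalone check of those numbers (not part of the test itself):

ids = [0] + list(range(1, 11))  # the initial (date, 0) row plus the ten rows inserted during detach
assert len(ids) == 11
assert sum(ids) == 55
print("count={}, sum={}".format(len(ids), sum(ids)))  # matches the expected "11\t55\n"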
@@ -74,3 +84,81 @@ def test_choose_source_replica(start_cluster):
    assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker)
    assert node2.contains_in_log("Will mimic node3")

    sync_replicas("test_table")
    for node in nodes:
        assert node.query("SELECT count(), sum(id) FROM test_table WHERE date=toDate(2)") == "11\t55\n"


def test_update_metadata(start_cluster):
    for node in nodes:
        node.query(
        '''
            CREATE TABLE update_metadata(key UInt32)
            ENGINE = ReplicatedMergeTree('/test/update_metadata', '{replica}') ORDER BY key PARTITION BY key % 10
            {settings};
        '''.format(replica=node.name, settings=SETTINGS))

    for i in range(1, 11):
        node1.query("INSERT INTO update_metadata VALUES ({})".format(i))

    node2.query("DETACH TABLE update_metadata")
    # alter without mutation
    node1.query("ALTER TABLE update_metadata ADD COLUMN col1 UInt32")

    for i in range(1, 11):
        node1.query("INSERT INTO update_metadata VALUES ({}, {})".format(i * 10, i * 10))

    lost_marker = "Will mark replica node2 as lost"
    assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker)

    node2.query("ATTACH TABLE update_metadata")
    sync_replicas("update_metadata")
    assert node1.query("DESC TABLE update_metadata") == node2.query("DESC TABLE update_metadata")
    assert node1.query("DESC TABLE update_metadata") == node3.query("DESC TABLE update_metadata")
    for node in nodes:
        assert node.query("SELECT count(), sum(key), sum(col1) FROM update_metadata") == "20\t605\t550\n"

    node2.query("DETACH TABLE update_metadata")
    # alter with mutation
    node1.query("ALTER TABLE update_metadata DROP COLUMN col1")
    for i in range(1, 11):
        node1.query("INSERT INTO update_metadata VALUES ({})".format(i * 100))

    lost_marker = "Will mark replica node2 as lost"
    assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker)

    node2.query("ATTACH TABLE update_metadata")
    sync_replicas("update_metadata")
    assert node1.query("DESC TABLE update_metadata") == node2.query("DESC TABLE update_metadata")
    assert node1.query("DESC TABLE update_metadata") == node3.query("DESC TABLE update_metadata")

    # check that it's possible to execute alter on cloned replica
    node2.query("ALTER TABLE update_metadata ADD COLUMN col1 UInt32")
    sync_replicas("update_metadata")
    for node in nodes:
        assert node.query("SELECT count(), sum(key), sum(col1) FROM update_metadata") == "30\t6105\t0\n"

    # more complex case with multiple alters
    node2.query("TRUNCATE TABLE update_metadata")
    for i in range(1, 11):
        node1.query("INSERT INTO update_metadata VALUES ({}, {})".format(i, i))

    # The following alters hang because of "No active replica has part ... or covering part"
    #node2.query("SYSTEM STOP REPLICATED SENDS update_metadata")
    #node2.query("INSERT INTO update_metadata VALUES (42, 42)") # this part will be lost
    node2.query("DETACH TABLE update_metadata")

    node1.query("ALTER TABLE update_metadata MODIFY COLUMN col1 String")
    node1.query("ALTER TABLE update_metadata ADD COLUMN col2 INT")
    for i in range(1, 11):
        node3.query("INSERT INTO update_metadata VALUES ({}, '{}', {})".format(i * 10, i * 10, i * 10))
    node1.query("ALTER TABLE update_metadata DROP COLUMN col1")
    node1.query("ALTER TABLE update_metadata ADD COLUMN col3 Date")

    node2.query("ATTACH TABLE update_metadata")
    sync_replicas("update_metadata")
    assert node1.query("DESC TABLE update_metadata") == node2.query("DESC TABLE update_metadata")
    assert node1.query("DESC TABLE update_metadata") == node3.query("DESC TABLE update_metadata")
    for node in nodes:
        assert node.query("SELECT count(), sum(key), sum(col2) FROM update_metadata") == "20\t605\t550\n"
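The expected tuples in test_update_metadata also follow from simple arithmetic over the inserted keys. A quick standalone check of the values the assertions compare against (not part of the test itself):

# First scenario: ten rows inserted before the ALTER and ten after it.
first_batch = list(range(1, 11))               # keys 1..10, col1 did not exist yet (defaults to 0)
second_batch = [i * 10 for i in range(1, 11)]  # keys 10..100, col1 set to the same value as the key

count = len(first_batch) + len(second_batch)
sum_key = sum(first_batch) + sum(second_batch)
sum_col1 = sum(second_batch)
assert (count, sum_key, sum_col1) == (20, 605, 550)  # the "20\t605\t550" the test expects

# After DROP COLUMN col1, ten more inserts of i * 100, and ADD COLUMN col1 again (all values 0):
third_batch = [i * 100 for i in range(1, 11)]
assert (count + len(third_batch), sum_key + sum(third_batch), 0) == (30, 6105, 0)  # "30\t6105\t0"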