ClickHouse/src/Databases/DatabaseReplicated.cpp

390 lines
15 KiB
C++
Raw Normal View History

2020-10-22 15:08:00 +00:00
#include <DataTypes/DataTypeString.h>
2020-04-05 12:18:51 +00:00
#include <Databases/DatabaseReplicated.h>
#include <IO/ReadBufferFromFile.h>
2020-05-11 12:55:17 +00:00
#include <IO/ReadBufferFromString.h>
2020-04-05 12:18:51 +00:00
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
2020-05-11 12:55:17 +00:00
#include <Interpreters/executeQuery.h>
2020-04-05 12:18:51 +00:00
#include <Parsers/queryToString.h>
2020-05-12 13:35:05 +00:00
#include <Common/Exception.h>
2020-10-22 15:08:00 +00:00
#include <Common/Stopwatch.h>
2020-04-05 12:18:51 +00:00
#include <Common/ZooKeeper/KeeperException.h>
#include <Common/ZooKeeper/Types.h>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Interpreters/DDLWorker.h>
#include <Interpreters/DDLTask.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
2020-11-13 18:35:45 +00:00
#include <Interpreters/Cluster.h>
#include <common/getFQDNOrHostName.h>
#include <Parsers/ASTAlterQuery.h>
2020-04-05 12:18:51 +00:00
namespace DB
{
/// Error codes used in this translation unit (defined centrally elsewhere in the project).
namespace ErrorCodes
{
    extern const int NO_ZOOKEEPER;
    extern const int LOGICAL_ERROR;
    extern const int BAD_ARGUMENTS;
    extern const int REPLICA_IS_ALREADY_EXIST;
    /// Thrown by DatabaseReplicated::propose() for unsupported ALTER types.
    extern const int NOT_IMPLEMENTED;
}
2020-11-13 18:35:45 +00:00
/// Name of the initial (empty) snapshot node that is created together with the database nodes in ZooKeeper.
constexpr const char * first_entry_name = "query-0000000000";
2020-04-05 12:18:51 +00:00
2020-11-13 18:35:45 +00:00
/// Returns the ZooKeeper client handle obtained from the global context.
zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const
{
    zkutil::ZooKeeperPtr session = global_context.getZooKeeper();
    return session;
}
2020-11-13 18:35:45 +00:00
/// Builds the identifier of the current host from its FQDN (or host name)
/// and the TCP port reported by the given context.
static inline String getHostID(const Context & global_context)
{
    const auto tcp_port = global_context.getTCPPort();
    const String host_name = getFQDNOrHostName();
    return Cluster::Address::toString(host_name, tcp_port);
}
2020-11-13 18:35:45 +00:00
/// Defaulted destructor, defined out of line in this translation unit.
/// NOTE(review): presumably kept here so that members such as ddl_worker are destroyed where their types are complete - confirm.
DatabaseReplicated::~DatabaseReplicated() = default;
2020-04-05 12:18:51 +00:00
/// Creates a replicated database object and registers it in ZooKeeper.
///
/// Steps:
///  1. validate the arguments and normalize `zookeeper_path` (no trailing '/', always a leading '/');
///  2. create the shared database nodes if the database path does not exist yet;
///  3. if the replica node already exists and belongs to this host, read its log pointer,
///     otherwise create the replica nodes.
///
/// Throws BAD_ARGUMENTS for empty names or names containing '/',
/// NO_ZOOKEEPER if the context has no ZooKeeper,
/// REPLICA_IS_ALREADY_EXIST if the replica node was registered by another host.
DatabaseReplicated::DatabaseReplicated(
    const String & name_,
    const String & metadata_path_,
    UUID uuid,
    const String & zookeeper_path_,
    const String & shard_name_,
    const String & replica_name_,
    Context & context_)
    : DatabaseAtomic(name_, metadata_path_, uuid, "DatabaseReplicated (" + name_ + ")", context_)
    , zookeeper_path(zookeeper_path_)
    , shard_name(shard_name_)
    , replica_name(replica_name_)
{
    if (zookeeper_path.empty() || shard_name.empty() || replica_name.empty())
        throw Exception("ZooKeeper path, shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS);
    if (shard_name.find('/') != std::string::npos || replica_name.find('/') != std::string::npos)
        throw Exception("Shard and replica names should not contain '/'", ErrorCodes::BAD_ARGUMENTS);

    if (zookeeper_path.back() == '/')
        zookeeper_path.resize(zookeeper_path.size() - 1);

    /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it.
    /// The path may have become empty above (when it was exactly "/"),
    /// so emptiness is checked first: calling front() on an empty string is undefined behaviour.
    if (zookeeper_path.empty() || zookeeper_path.front() != '/')
        zookeeper_path = "/" + zookeeper_path;

    if (!context_.hasZooKeeper())
    {
        throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER);
    }
    //FIXME it will fail on startup if zk is not available

    auto current_zookeeper = global_context.getZooKeeper();

    if (!current_zookeeper->exists(zookeeper_path))
    {
        /// Create new database, multiple nodes can execute it concurrently
        createDatabaseNodesInZooKeeper(current_zookeeper);
    }

    replica_path = zookeeper_path + "/replicas/" + shard_name + "|" + replica_name;

    String replica_host_id;
    if (current_zookeeper->tryGet(replica_path, replica_host_id))
    {
        /// The replica node stores the ID of the host that created it; refuse to reuse a replica of another host
        String host_id = getHostID(global_context);
        if (replica_host_id != host_id)
            throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST,
                            "Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'",
                            replica_name, shard_name, zookeeper_path, replica_host_id, host_id);

        log_entry_to_execute = current_zookeeper->get(replica_path + "/log_ptr");
    }
    else
    {
        /// Throws if replica with the same name was created concurrently
        createReplicaNodesInZooKeeper(current_zookeeper);
    }

    assert(log_entry_to_execute.starts_with("query-"));

    snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10);
    LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period);
}
2020-05-24 17:13:53 +00:00
2020-11-13 18:35:45 +00:00
/// Creates the shared nodes of the database (root, log, snapshots with an empty initial snapshot, replicas)
/// in a single ZooKeeper multi-request.
///
/// Returns true if the nodes were created by this call and false if some of them already existed
/// (ZNODEEXISTS). Any other error is reported as an exception.
bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper)
{
    current_zookeeper->createAncestors(zookeeper_path);

    Coordination::Requests ops;
    ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent));
    ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent));
    ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots", "", zkutil::CreateMode::Persistent));
    /// Create empty snapshot (with no tables)
    ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots/" + first_entry_name, "", zkutil::CreateMode::Persistent));
    ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent));

    Coordination::Responses responses;
    auto res = current_zookeeper->tryMulti(ops, responses);
    if (res == Coordination::Error::ZOK)
        return true;
    if (res == Coordination::Error::ZNODEEXISTS)
        return false;

    /// Expected to throw for any remaining error code.
    zkutil::KeeperMultiException::check(res, ops, responses);

    /// Must be unreachable. An explicit throw (instead of a bare assert) guarantees that a non-void
    /// function never flows off its end in builds where assertions are compiled out.
    assert(false);
    throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected result of creating nodes of replicated database in ZooKeeper");
}
2020-11-13 18:35:45 +00:00
/// Registers this replica in ZooKeeper: creates the replica node (holding the host ID)
/// and its "log_ptr" child (holding the name of the latest snapshot) in one multi-request.
/// Throws LOGICAL_ERROR if the database has no snapshots.
void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper)
{
    current_zookeeper->createAncestors(replica_path);

    const Strings snapshot_names = current_zookeeper->getChildren(zookeeper_path + "/snapshots");
    if (snapshot_names.empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "No snapshots found");

    /// A new replica starts from the latest snapshot, i.e. the greatest snapshot name
    log_entry_to_execute = *std::max_element(snapshot_names.begin(), snapshot_names.end());

    /// The host ID stored in the replica node protects from multiple replicas with the same name
    const String this_host_id = getHostID(global_context);
    const String log_ptr_path = replica_path + "/log_ptr";

    Coordination::Requests create_requests;
    create_requests.emplace_back(zkutil::makeCreateRequest(replica_path, this_host_id, zkutil::CreateMode::Persistent));
    create_requests.emplace_back(zkutil::makeCreateRequest(log_ptr_path, log_entry_to_execute, zkutil::CreateMode::Persistent));
    current_zookeeper->multi(create_requests);
}
2020-11-13 18:35:45 +00:00
/// Loads the stored objects via DatabaseAtomic and then creates the DDL worker
/// that processes the "/log" queue of this database.
void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach)
{
    DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach);

    /// Describe this replica for the worker
    DatabaseReplicatedExtensions replica_info;
    replica_info.first_not_executed = log_entry_to_execute;
    replica_info.replica_name = replica_name;
    replica_info.shard_name = shard_name;
    replica_info.database_name = getDatabaseName();
    replica_info.database_uuid = getUUID();

    /// Pool size must be 1 (to avoid reordering of log entries)
    constexpr size_t single_thread = 1;
    const String log_queue_path = zookeeper_path + "/log";

    ddl_worker = std::make_unique<DDLWorker>(
        single_thread,
        log_queue_path,
        global_context,
        nullptr,
        "",
        std::make_optional<DatabaseReplicatedExtensions>(std::move(replica_info)));
}
2020-11-13 18:35:45 +00:00
2020-10-26 15:12:16 +00:00
void DatabaseReplicated::removeOutdatedSnapshotsAndLog()
2020-10-22 15:08:00 +00:00
{
/// This method removes all snapshots and logged queries
/// that no longer will be in use by current replicas or
/// new coming ones.
/// Each registered replica has its state in ZooKeeper.
/// Therefore, snapshots and logged queries that are less
/// than a least advanced replica are removed.
/// It does not interfere with a new coming replica
/// metadata loading from snapshot
/// because the replica will use the latest snapshot available
/// and this snapshot will set the last executed log query
/// to a greater one than the least advanced current replica.
2020-11-13 18:35:45 +00:00
auto current_zookeeper = getZooKeeper();
2020-06-20 15:39:58 +00:00
Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas");
2020-10-27 09:19:45 +00:00
//TODO do not use log pointers to determine which entries to remove if there are staled pointers.
// We can just remove all entries older than previous snapshot version.
// Possible invariant: store all entries since last snapshot, replica becomes lost when it cannot get log entry.
2020-06-20 15:39:58 +00:00
auto least_advanced = std::min_element(replica_states.begin(), replica_states.end());
Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots");
2020-10-22 15:08:00 +00:00
if (snapshots.size() < 2)
{
2020-06-20 15:39:58 +00:00
return;
}
std::sort(snapshots.begin(), snapshots.end());
auto still_useful = std::lower_bound(snapshots.begin(), snapshots.end(), *least_advanced);
snapshots.erase(still_useful, snapshots.end());
2020-10-22 15:08:00 +00:00
for (const String & snapshot : snapshots)
{
2020-06-20 15:39:58 +00:00
current_zookeeper->tryRemoveRecursive(zookeeper_path + "/snapshots/" + snapshot);
}
Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log");
std::sort(log_entry_names.begin(), log_entry_names.end());
auto still_useful_log = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), *still_useful);
log_entry_names.erase(still_useful_log, log_entry_names.end());
2020-10-22 15:08:00 +00:00
for (const String & log_entry_name : log_entry_names)
{
2020-06-20 15:39:58 +00:00
String log_entry_path = zookeeper_path + "/log/" + log_entry_name;
current_zookeeper->tryRemove(log_entry_path);
}
2020-05-11 12:55:17 +00:00
}
2020-10-22 15:08:00 +00:00
void DatabaseReplicated::runBackgroundLogExecutor()
{
2020-10-26 15:12:16 +00:00
if (last_executed_log_entry.empty())
2020-10-22 15:08:00 +00:00
{
2020-06-20 15:39:58 +00:00
loadMetadataFromSnapshot();
}
2020-11-13 18:35:45 +00:00
auto current_zookeeper = getZooKeeper();
Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log");
std::sort(log_entry_names.begin(), log_entry_names.end());
auto newest_entry_it = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), last_executed_log_entry);
log_entry_names.erase(log_entry_names.begin(), newest_entry_it);
2020-10-22 15:08:00 +00:00
for (const String & log_entry_name : log_entry_names)
{
2020-11-13 18:35:45 +00:00
//executeLogName(log_entry_name);
last_executed_log_entry = log_entry_name;
writeLastExecutedToDiskAndZK();
2020-06-20 15:39:58 +00:00
int log_n = parse<int>(log_entry_name.substr(4));
int last_log_n = parse<int>(log_entry_names.back().substr(4));
2020-10-22 15:08:00 +00:00
/// The third condition gurantees at most one snapshot creation per batch
if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0)
{
2020-06-20 15:39:58 +00:00
createSnapshot();
}
2020-05-24 17:13:53 +00:00
}
//background_log_executor->scheduleAfter(500);
2020-05-11 12:55:17 +00:00
}
2020-10-22 15:08:00 +00:00
void DatabaseReplicated::writeLastExecutedToDiskAndZK()
{
2020-11-13 18:35:45 +00:00
auto current_zookeeper = getZooKeeper();
2020-10-22 15:08:00 +00:00
current_zookeeper->createOrUpdate(
zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent);
2020-05-27 18:40:00 +00:00
String metadata_file = getMetadataPath() + ".last_entry";
2020-06-20 15:39:58 +00:00
WriteBufferFromFile out(metadata_file, last_executed_log_entry.size(), O_WRONLY | O_CREAT);
writeString(last_executed_log_entry, out);
out.next();
if (global_context.getSettingsRef().fsync_metadata)
out.sync();
out.close();
}
2020-05-11 12:55:17 +00:00
/// Puts the query into the replicated DDL log via the DDL worker.
///
/// Returns an empty BlockIO if `distributed_ddl_task_timeout` is 0, otherwise a BlockIO
/// whose input stream reports the execution status of the enqueued entry.
/// Throws NOT_IMPLEMENTED if the query is an ALTER containing a command of unsupported type.
BlockIO DatabaseReplicated::propose(const ASTPtr & query)
{
    if (const auto * query_alter = query->as<ASTAlterQuery>())
    {
        for (const auto & command : query_alter->command_list->commands)
        {
            //FIXME allow all types of queries (maybe we should execute ATTACH an similar queries on leader)
            if (!isSupportedAlterType(command->type))
                throw Exception("Unsupported type of ALTER query", ErrorCodes::NOT_IMPLEMENTED);
        }
    }

    /// Serialize the query once: the text is used both for logging and for the log entry
    const String query_text = queryToString(query);
    LOG_DEBUG(log, "Proposing query: {}", query_text);

    DDLLogEntry entry;
    entry.hosts = {};
    entry.query = query_text;
    entry.initiator = ddl_worker->getCommonHostID();
    String node_path = ddl_worker->enqueueQuery(entry);

    BlockIO io;
    //FIXME use query context
    if (global_context.getSettingsRef().distributed_ddl_task_timeout == 0)
        return io;

    //FIXME need list of all replicas, we can obtain it from zk
    /// (no list of hosts to wait for is passed to the status stream yet)
    auto stream = std::make_shared<DDLQueryStatusInputStream>(node_path, entry, global_context);
    io.in = std::move(stream);
    return io;
}
2020-10-22 15:08:00 +00:00
void DatabaseReplicated::createSnapshot()
{
2020-11-13 18:35:45 +00:00
auto current_zookeeper = getZooKeeper();
2020-06-20 15:39:58 +00:00
String snapshot_path = zookeeper_path + "/snapshots/" + last_executed_log_entry;
2020-10-22 15:08:00 +00:00
if (Coordination::Error::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent))
{
2020-06-20 15:39:58 +00:00
return;
}
2020-10-22 15:08:00 +00:00
for (auto iterator = getTablesIterator(global_context, {}); iterator->isValid(); iterator->next())
{
String table_name = iterator->name();
auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true);
String statement = queryToString(query);
2020-06-26 14:05:27 +00:00
current_zookeeper->create(snapshot_path + "/" + table_name, statement, zkutil::CreateMode::Persistent);
}
2020-06-26 14:05:27 +00:00
current_zookeeper->create(snapshot_path + "/.completed", String(), zkutil::CreateMode::Persistent);
2020-06-20 15:39:58 +00:00
2020-10-26 15:12:16 +00:00
removeOutdatedSnapshotsAndLog();
2020-05-11 12:55:17 +00:00
}
2020-04-05 12:18:51 +00:00
2020-10-22 15:08:00 +00:00
void DatabaseReplicated::loadMetadataFromSnapshot()
{
/// Executes the latest snapshot.
/// Used by new replicas only.
2020-11-13 18:35:45 +00:00
auto current_zookeeper = getZooKeeper();
2020-05-24 17:13:53 +00:00
2020-06-20 15:39:58 +00:00
Strings snapshots;
2020-06-27 13:39:41 +00:00
if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::Error::ZOK)
2020-06-20 15:39:58 +00:00
return;
2020-06-24 12:45:42 +00:00
auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end());
2020-10-22 15:08:00 +00:00
while (snapshots.size() > 0 && !current_zookeeper->exists(zookeeper_path + "/snapshots/" + *latest_snapshot + "/.completed"))
{
2020-06-24 12:45:42 +00:00
snapshots.erase(latest_snapshot);
latest_snapshot = std::max_element(snapshots.begin(), snapshots.end());
}
2020-10-22 15:08:00 +00:00
if (snapshots.size() < 1)
{
2020-06-20 15:39:58 +00:00
return;
}
2020-05-24 17:13:53 +00:00
Strings metadatas;
2020-06-27 13:39:41 +00:00
if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::Error::ZOK)
2020-05-24 17:13:53 +00:00
return;
2020-06-27 13:39:41 +00:00
LOG_DEBUG(log, "Executing {} snapshot", *latest_snapshot);
2020-10-22 15:08:00 +00:00
for (auto t = metadatas.begin(); t != metadatas.end(); ++t)
{
2020-06-20 15:39:58 +00:00
String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t;
String query_to_execute = current_zookeeper->get(path, {}, nullptr);
auto current_context = std::make_unique<Context>(global_context);
current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY;
current_context->setCurrentDatabase(database_name);
current_context->setCurrentQueryId(""); // generate random query_id
executeQuery(query_to_execute, *current_context);
2020-05-24 17:13:53 +00:00
}
2020-06-20 15:39:58 +00:00
last_executed_log_entry = *latest_snapshot;
writeLastExecutedToDiskAndZK();
2020-06-20 15:39:58 +00:00
}
void DatabaseReplicated::drop(const Context & context_)
{
2020-11-13 18:35:45 +00:00
auto current_zookeeper = getZooKeeper();
2020-06-20 15:39:58 +00:00
current_zookeeper->tryRemove(zookeeper_path + "/replicas/" + replica_name);
DatabaseAtomic::drop(context_);
2020-05-24 17:13:53 +00:00
}
2020-11-13 18:35:45 +00:00
/// Stops and releases the DDL worker (if it was created), then shuts down the underlying DatabaseAtomic.
void DatabaseReplicated::shutdown()
{
    if (ddl_worker != nullptr)
    {
        ddl_worker->shutdown();
        ddl_worker.reset();
    }

    DatabaseAtomic::shutdown();
}
2020-04-05 12:18:51 +00:00
}